diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index e2ed45eec0ecd..66b6570e7659c 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -568,6 +568,13 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef DstRegs, const TypeSize PartSize = PartTy.getSizeInBits(); + if (PartSize == SrcTy.getSizeInBits() && DstRegs.size() == 1) { + // TODO: Handle int<->ptr casts. It just happens the ABI lowering + // assignments are not pointer aware. + B.buildBitcast(DstRegs[0], SrcReg); + return; + } + if (PartTy.isVector() == SrcTy.isVector() && PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) { assert(DstRegs.size() == 1); @@ -576,7 +583,8 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef DstRegs, } if (SrcTy.isVector() && !PartTy.isVector() && - TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits())) { + TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits()) && + SrcTy.getElementCount() == ElementCount::getFixed(DstRegs.size())) { // Vector was scalarized, and the elements extended. auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg); for (int i = 0, e = DstRegs.size(); i != e; ++i) @@ -614,9 +622,21 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef DstRegs, MachineRegisterInfo &MRI = *B.getMRI(); LLT DstTy = MRI.getType(DstRegs[0]); - LLT LCMTy = getCoverTy(SrcTy, PartTy); + LLT CoverTy = getCoverTy(SrcTy, PartTy); + if (SrcTy.isVector() && DstRegs.size() > 1) { + TypeSize FullCoverSize = + DstTy.getSizeInBits().multiplyCoefficientBy(DstRegs.size()); + + LLT EltTy = SrcTy.getElementType(); + TypeSize EltSize = EltTy.getSizeInBits(); + if (FullCoverSize.isKnownMultipleOf(EltSize)) { + TypeSize VecSize = FullCoverSize.divideCoefficientBy(EltSize); + CoverTy = + LLT::vector(ElementCount::get(VecSize, VecSize.isScalable()), EltTy); + } + } - if (PartTy.isVector() && LCMTy == PartTy) { + if (PartTy.isVector() && CoverTy == PartTy) { assert(DstRegs.size() == 1); B.buildPadVectorWithUndefElements(DstRegs[0], SrcReg); return; @@ -624,11 +644,11 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef DstRegs, const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); - unsigned CoveringSize = LCMTy.getSizeInBits(); + unsigned CoveringSize = CoverTy.getSizeInBits(); Register UnmergeSrc = SrcReg; - if (!LCMTy.isVector() && CoveringSize != SrcSize) { + if (!CoverTy.isVector() && CoveringSize != SrcSize) { // For scalars, it's common to be able to use a simple extension. if (SrcTy.isScalar() && DstTy.isScalar()) { CoveringSize = alignTo(SrcSize, DstSize); @@ -641,12 +661,12 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef DstRegs, SmallVector MergeParts(1, SrcReg); for (unsigned Size = SrcSize; Size != CoveringSize; Size += SrcSize) MergeParts.push_back(Undef); - UnmergeSrc = B.buildMergeLikeInstr(LCMTy, MergeParts).getReg(0); + UnmergeSrc = B.buildMergeLikeInstr(CoverTy, MergeParts).getReg(0); } } - if (LCMTy.isVector() && CoveringSize != SrcSize) - UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0); + if (CoverTy.isVector() && CoveringSize != SrcSize) + UnmergeSrc = B.buildPadVectorWithUndefElements(CoverTy, SrcReg).getReg(0); B.buildUnmerge(DstRegs, UnmergeSrc); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1ec0204005f12..8c95cacd02079 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1111,9 +1111,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 16) { - if (Subtarget->has16BitInsts()) - return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); - return VT.isInteger() ? MVT::i32 : MVT::f32; + return Subtarget->has16BitInsts() + ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2) + : MVT::i32; } if (Size < 16) @@ -1139,7 +1139,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, unsigned Size = ScalarVT.getSizeInBits(); // FIXME: Should probably promote 8-bit vectors to i16. - if (Size == 16 && Subtarget->has16BitInsts()) + if (Size == 16) return (NumElts + 1) / 2; if (Size <= 32) @@ -1163,11 +1163,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); - IntermediateVT = RegisterVT; + if (Size == 16) { + MVT SimpleIntermediateVT = + MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2)); + IntermediateVT = SimpleIntermediateVT; + RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32; NumIntermediates = (NumElts + 1) / 2; - return NumIntermediates; + return (NumElts + 1) / 2; } if (Size == 32) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll index d6f1b142b36e0..5c60eb696f6b2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll @@ -200,10 +200,15 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s16, s16, s18 -; GFX7-NEXT: s_add_i32 s17, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_lshr_b32 s4, s16, 16 +; GFX7-NEXT: s_lshr_b32 s5, s17, 16 +; GFX7-NEXT: s_add_i32 s4, s4, s5 +; GFX7-NEXT: s_add_i32 s16, s16, s17 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: s_and_b32 s5, s16, 0xffff +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_add_v2i16: @@ -278,8 +283,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 814acc3be1fc0..244d006844a09 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -9,8 +9,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16: @@ -40,13 +46,15 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_lhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs: @@ -79,13 +87,15 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_rhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_rhs: @@ -118,18 +128,16 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) { ; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: @@ -165,8 +173,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -197,8 +210,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -230,8 +248,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -262,6 +285,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -304,6 +328,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_add_i32 s1, s1, 4 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -346,6 +371,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_add_i32 s0, s0, 4 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff @@ -388,9 +414,11 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -433,14 +461,12 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_lhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -488,14 +514,12 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_rhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s3, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_or_b32 s2, s3, s2 -; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000 -; GFX7-NEXT: s_lshr_b32 s3, s2, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 -; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_i32 s2, s2, s3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -543,12 +567,6 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX7-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 -; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX7-NEXT: s_lshr_b32 s2, s0, 16 @@ -609,7 +627,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { ; GFX7-LABEL: add_inline_imm_neg1_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_neg1_0: @@ -640,7 +662,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { ; GFX7-LABEL: add_inline_imm_1_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_inline_imm_1_0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 22b63a7de5f89..a20387b17c53d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -513,14 +513,8 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) { define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16: @@ -546,14 +540,8 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s1, s0 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_commute: @@ -579,14 +567,8 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s1, s3, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_use: @@ -619,18 +601,9 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_xor_b32 s1, s4, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s3, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -662,26 +635,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg } define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GFX6-LABEL: v_andn2_v2i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_andn2_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_andn2_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_andn2_v2i16: ; GFX10PLUS: ; %bb.0: @@ -698,19 +657,19 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff @@ -751,19 +710,19 @@ define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1 define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff @@ -804,30 +763,30 @@ define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inr define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX6-NEXT: s_and_b32 s1, s3, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s0, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s6, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s4, 0xffff -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_and_b32 s1, s3, 0xffff +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 -; GFX6-NEXT: s_lshr_b32 s5, s6, 16 +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s7, 0xffff +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v3i16_multi_use: @@ -876,21 +835,25 @@ define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { ; GFX6-LABEL: v_andn2_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, 0xfff5, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, 0xfff5, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v1, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_andn2_v3i16: @@ -918,17 +881,21 @@ define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] @@ -958,17 +925,21 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1 define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v4i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] @@ -998,17 +969,21 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v4i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] @@ -1046,23 +1021,29 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, 0xffff +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 +; GFX6-NEXT: s_lshr_b32 s5, s7, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_and_b32 s6, s7, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] @@ -1101,24 +1082,26 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) { ; GFX6-LABEL: v_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_andn2_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ecd7cc24fd920..6fe6b526a7afe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -731,12 +731,16 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) { ; GFX6-LABEL: v_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16: @@ -766,10 +770,14 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { ; GFX6-LABEL: v_ashr_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v2i16_15: @@ -799,14 +807,15 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s3 +; GFX6-NEXT: s_lshr_b32 s2, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX6-NEXT: s_ashr_i32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_ashr_i32 s1, s3, s1 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v2i16: @@ -852,12 +861,12 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: ashr_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_sext_i32_i16 s1, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 +; GFX6-NEXT: v_ashr_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -890,16 +899,16 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: ashr_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ashr_v2i16_vs: @@ -940,26 +949,26 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) { ; GFX6-LABEL: v_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 +; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, v2, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v4i16: @@ -994,22 +1003,24 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s5 +; GFX6-NEXT: s_lshr_b32 s4, s2, 16 +; GFX6-NEXT: s_sext_i32_i16 s6, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX6-NEXT: s_lshr_b32 s5, s3, 16 ; GFX6-NEXT: s_ashr_i32 s0, s0, s4 -; GFX6-NEXT: s_sext_i32_i16 s2, s2 -; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_ashr_i32 s2, s2, s6 -; GFX6-NEXT: s_ashr_i32 s3, s3, s7 +; GFX6-NEXT: s_sext_i32_i16 s4, s1 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX6-NEXT: s_ashr_i32 s2, s6, s2 +; GFX6-NEXT: s_ashr_i32 s1, s1, s5 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_ashr_i32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_or_b32 s0, s2, s0 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v4i16: @@ -1103,46 +1114,46 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-LABEL: v_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, v4, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, v5, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, v9, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 16, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, v10, v2 +; GFX6-NEXT: v_bfe_i32 v8, v3, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, v11, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, v8, v7 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, v7, v8 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ashr_v8i16: @@ -1187,38 +1198,42 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: s_sext_i32_i16 s12, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX6-NEXT: s_lshr_b32 s9, s5, 16 ; GFX6-NEXT: s_ashr_i32 s0, s0, s8 -; GFX6-NEXT: s_sext_i32_i16 s2, s2 -; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_sext_i32_i16 s8, s1 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX6-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NEXT: s_ashr_i32 s4, s12, s4 +; GFX6-NEXT: s_ashr_i32 s5, s8, s5 +; GFX6-NEXT: s_ashr_i32 s1, s1, s9 +; GFX6-NEXT: s_sext_i32_i16 s8, s2 +; GFX6-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshr_b32 s11, s7, 16 +; GFX6-NEXT: s_ashr_i32 s6, s8, s6 ; GFX6-NEXT: s_ashr_i32 s2, s2, s10 +; GFX6-NEXT: s_sext_i32_i16 s8, s3 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s3, s3, s11 -; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_or_b32 s0, s4, s0 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sext_i32_i16 s4, s4 -; GFX6-NEXT: s_ashr_i32 s5, s5, s13 -; GFX6-NEXT: s_sext_i32_i16 s7, s7 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, s12 -; GFX6-NEXT: s_sext_i32_i16 s6, s6 -; GFX6-NEXT: s_ashr_i32 s7, s7, s15 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_ashr_i32 s7, s8, s7 +; GFX6-NEXT: s_or_b32 s1, s4, s1 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_ashr_i32 s6, s6, s14 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll index d03bbdecf84ef..a28b217607eec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll @@ -164,14 +164,10 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert: @@ -239,13 +235,9 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll index a8def6e6f6e92..2be9e29530ace 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll @@ -483,28 +483,20 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addr ; GFX6-NEXT: s_mov_b32 s7, 0x100f000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: @@ -588,26 +580,18 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addr ; GFX6-NEXT: s_mov_b32 s7, 0x100f000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll index fc7eafbbcdc77..08be9df3e5c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll @@ -377,14 +377,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert: @@ -455,13 +451,9 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index 132dc876b3b05..57755c6856858 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -447,16 +447,17 @@ define i16 @v_bswap_i16(i16 %src) { define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 -; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_lshl_b32 s2, s1, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 -; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX7-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX7-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GFX7-NEXT: s_or_b32 s2, s3, s2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_and_b32 s0, 0xffff, s2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_bswap_v2i16: @@ -560,12 +561,15 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { ; GFX7-LABEL: v_bswap_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 24 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_v2i16: @@ -595,15 +599,19 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { ; GFX7-LABEL: v_bswap_v3i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 24 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll index 9eeb633f0a817..31907a6ee9656 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -62,15 +62,26 @@ define i16 @halfinsts_add_i16(i16 %arg0) #1 { define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 { ; CHECK-LABEL: name: halfinsts_add_v2i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) - ; CHECK-NEXT: $vgpr1 = COPY [[ADD1]](s32) - ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[BITCAST1]], [[BITCAST2]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[LSHR1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST3]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST4]](s32) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %add = add <2 x i16> %arg0, %arg0 ret <2 x i16> %add } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 62b264a537457..91aa286886a43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -750,13 +750,15 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-LABEL: v_fdiv_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -764,30 +766,34 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v3 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -796,26 +802,28 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_fdiv_v2f16: @@ -1066,16 +1074,20 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: @@ -1128,13 +1140,15 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -1142,30 +1156,34 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v3 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -1174,26 +1192,28 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25: @@ -1444,23 +1464,12 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 @@ -1468,17 +1477,32 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -1487,14 +1511,14 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -1504,8 +1528,10 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16: @@ -1725,23 +1751,12 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 @@ -1749,17 +1764,32 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -1768,14 +1798,14 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -1785,8 +1815,10 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16: @@ -2006,12 +2038,10 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_fabs: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 @@ -2021,30 +2051,28 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2059,25 +2087,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs: @@ -2308,12 +2338,10 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16_fabs: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 @@ -2323,30 +2351,28 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2361,25 +2387,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs: @@ -2610,23 +2638,12 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 @@ -2634,17 +2651,32 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2653,14 +2685,14 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2670,8 +2702,10 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: @@ -2715,12 +2749,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { ; GFX6-LABEL: v_rcp_v2f16_arcp_afn: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp_afn: @@ -2764,23 +2801,12 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 @@ -2788,17 +2814,32 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2807,14 +2848,14 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2824,8 +2865,10 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25: @@ -3045,16 +3088,20 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -3107,13 +3154,15 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 @@ -3121,30 +3170,34 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v3 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v6, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 @@ -3153,26 +3206,28 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -3225,16 +3280,20 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: @@ -5190,55 +5249,59 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-LABEL: v_rsq_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rsq_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -5247,13 +5310,13 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -5263,8 +5326,10 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rsq_v2f16: @@ -5483,55 +5548,59 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-LABEL: v_neg_rsq_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -5540,13 +5609,13 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -5556,8 +5625,10 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_neg_rsq_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 067704cfb4d80..f48c72688533a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -362,16 +362,21 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { ; GFX6-LABEL: v_fma_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16: @@ -427,21 +432,22 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX6-LABEL: v_fma_v2f16_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs: @@ -499,21 +505,22 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX6-LABEL: v_fma_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_rhs: @@ -571,16 +578,21 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: @@ -638,21 +650,26 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { ; GFX6-LABEL: v_fma_v3f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v6, v6, v7, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v1, v3, v4 -; GFX6-NEXT: v_fma_f32 v2, v2, v5, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v3f16: @@ -714,26 +731,36 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { ; GFX6-LABEL: v_fma_v4f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GFX6-NEXT: v_fma_f32 v2, v2, v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_fma_f32 v3, v7, v8, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v4f16: @@ -1618,16 +1645,25 @@ define amdgpu_ps void @fma_s64_uniform(double inreg %a, double inreg %b, double define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b, <2 x half> inreg %c) { ; GFX6-LABEL: fma_v2s16_uniform: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_lshr_b32 s4, s1, 16 +; GFX6-NEXT: s_lshr_b32 s5, s2, 16 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, s3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, s3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, s4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, s5 ; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: fma_v2s16_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 98a26a4e672b9..b19216ae7bbc5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -407,31 +407,35 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000 ; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x42800000 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_not_b32_e32 v3, 63 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v5, s[4:5] +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_not_b32_e32 v4, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16: @@ -542,36 +546,36 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_not_b32_e32 v5, 63 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_exp_f32_e32 v2, v0 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v5, s[4:5] +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_not_b32_e32 v4, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -690,36 +694,36 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_not_b32_e32 v5, 63 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v4 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v4 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5] +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_not_b32_e32 v5, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: @@ -837,41 +841,37 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_log_f32_e32 v3, v3 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_log_f32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, 0xc2fc0000 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x42800000 -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_not_b32_e32 v5, 63 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xc2fc0000 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v4 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, v4 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_exp_f32_e32 v1, v0 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_not_b32_e32 v5, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 54efb26ae1e01..6ad73601859d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3653,18 +3653,20 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s4, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_lshr_b32 s4, s2, 16 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_bfe_u32 s5, s1, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s2, s5, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s5, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, s4 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s2, s4, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_lshr_b32 s1, s1, 17 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s4 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3772,20 +3774,26 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16: @@ -3857,14 +3865,16 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_fshl_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 11, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 11, v3 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 24 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i16_4_8: @@ -3914,18 +3924,20 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshl_v2i16_ssv: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 +; GFX6-NEXT: s_lshr_b32 s0, s1, 17 +; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4013,22 +4025,24 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshl_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_and_b32 s4, s1, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: v_bfe_u32 v1, v0, 1, 15 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: s_lshl_b32 s0, s1, s0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: s_and_b32 s0, s3, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 17, v0 +; GFX6-NEXT: s_lshl_b32 s0, s2, s0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_v2i16_svs: @@ -4103,16 +4117,18 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshl_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s4, s2, 15 +; GFX6-NEXT: s_and_b32 s3, s1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s2, s1, 16 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s1, s3, s1 +; GFX6-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s1, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_lshr_b32 s0, s0, s2 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s0, s0, 17 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4209,29 +4225,31 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { ; GFX6-LABEL: s_fshl_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s9, s6, 15 -; GFX6-NEXT: s_andn2_b32 s6, 15, s6 +; GFX6-NEXT: s_and_b32 s8, s4, 15 +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_lshl_b32 s0, s0, s8 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s4, s8, s4 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 15 +; GFX6-NEXT: s_andn2_b32 s7, 15, s7 +; GFX6-NEXT: s_lshr_b32 s2, s2, 17 +; GFX6-NEXT: s_lshl_b32 s4, s6, s4 +; GFX6-NEXT: s_lshr_b32 s2, s2, s7 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_and_b32 s4, s5, 15 +; GFX6-NEXT: s_andn2_b32 s5, 15, s5 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s9 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 -; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s7, 15 -; GFX6-NEXT: s_andn2_b32 s6, 15, s7 -; GFX6-NEXT: s_lshl_b32 s1, s1, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_or_b32 s1, s1, s3 -; GFX6-NEXT: s_and_b32 s3, s8, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v3i16: @@ -4414,27 +4432,34 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-LABEL: v_fshl_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_bfe_u32 v8, v2, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v3i16: @@ -4535,36 +4560,40 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s12, s8, 15 -; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, s8 +; GFX6-NEXT: s_and_b32 s10, s4, 15 +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_lshl_b32 s0, s0, s10 +; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s4, s10, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s9, 15 -; GFX6-NEXT: s_andn2_b32 s8, 15, s9 +; GFX6-NEXT: s_and_b32 s4, s8, 15 +; GFX6-NEXT: s_andn2_b32 s8, 15, s8 +; GFX6-NEXT: s_lshr_b32 s2, s2, 17 +; GFX6-NEXT: s_lshl_b32 s4, s6, s4 +; GFX6-NEXT: s_lshr_b32 s2, s2, s8 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_and_b32 s4, s5, 15 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_lshr_b32 s9, s5, 16 +; GFX6-NEXT: s_andn2_b32 s5, 15, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s4, s4, s8 -; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s4, s10, 15 -; GFX6-NEXT: s_andn2_b32 s5, 15, s10 -; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_and_b32 s4, s11, 15 -; GFX6-NEXT: s_andn2_b32 s5, 15, s11 -; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s9, 15 +; GFX6-NEXT: s_andn2_b32 s5, 15, s9 +; GFX6-NEXT: s_lshr_b32 s3, s3, 17 +; GFX6-NEXT: s_lshl_b32 s4, s7, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -4745,34 +4774,46 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-LABEL: v_fshl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v10, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v10, v0 +; GFX6-NEXT: v_bfe_u32 v10, v2, 1, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_bfe_u32 v4, v3, 1, 15 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 17, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 1e762f9a927f8..5afab53628c34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3404,20 +3404,22 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s4, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_lshr_b32 s4, s1, 16 +; GFX6-NEXT: s_lshr_b32 s5, s2, 16 +; GFX6-NEXT: s_and_b32 s6, s2, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s2, s2, s6 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s5, 15 -; GFX6-NEXT: s_andn2_b32 s4, 15, s5 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s2, s3, s2 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s5, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s5 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_lshr_b32 s1, s4, s1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3523,22 +3525,28 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 15, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16: @@ -3611,12 +3619,15 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_fshr_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 4, 12 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_bfe_u32 v3, v1, 4, 12 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 24 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i16_4_8: @@ -3666,21 +3677,23 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { ; GFX6-LABEL: v_fshr_v2i16_ssv: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s2, 0xffff +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_lshl_b32 s0, s1, 1 +; GFX6-NEXT: s_lshl_b32 s0, s2, 1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_and_b32 s0, s3, 0xffff -; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_lshr_b32_e32 v2, s3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3763,18 +3776,20 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_and_b32 s4, s1, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, s1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_lshl_b32 s1, s1, s2 +; GFX6-NEXT: s_andn2_b32 s1, 15, s3 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_lshl_b32 s1, s2, s1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -3865,19 +3880,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_and_b32 s4, s1, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s4 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_andn2_b32 s1, 15, s3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_lshr_b32 s0, s1, s0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1 +; GFX6-NEXT: s_lshr_b32 s0, s2, s0 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3962,32 +3979,34 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s9, s6, 15 -; GFX6-NEXT: s_andn2_b32 s6, 15, s6 +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s7, s2, 16 +; GFX6-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NEXT: s_and_b32 s9, s4, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, s9 -; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s7, 15 -; GFX6-NEXT: s_andn2_b32 s6, 15, s7 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_and_b32 s4, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, s6 -; GFX6-NEXT: s_lshr_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s2, s2, s9 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s8, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_lshl_b32 s6, s6, 1 +; GFX6-NEXT: s_lshl_b32 s4, s6, s4 +; GFX6-NEXT: s_lshr_b32 s2, s7, s2 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_and_b32 s4, s5, 15 +; GFX6-NEXT: s_andn2_b32 s5, 15, s5 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_or_b32 s1, s1, s3 -; GFX6-NEXT: s_and_b32 s3, s8, 15 -; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_and_b32 s4, s5, 0xffff -; GFX6-NEXT: s_lshr_b32 s3, s4, s3 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v3i16: @@ -4166,30 +4185,37 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-LABEL: v_fshr_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v9, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v9, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 -; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v9, v2 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 1, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v3i16: @@ -4291,37 +4317,41 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s12, s8, 15 -; GFX6-NEXT: s_andn2_b32 s8, 15, s8 +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s8, s2, 16 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 +; GFX6-NEXT: s_and_b32 s12, s4, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s4, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_lshr_b32 s4, s4, s12 -; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s9, 15 -; GFX6-NEXT: s_andn2_b32 s8, 15, s9 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, s8 -; GFX6-NEXT: s_lshr_b32 s4, s5, s4 -; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s2, s2, s12 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s10, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s10 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s10, 15 -; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_and_b32 s4, s6, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s4, s1 -; GFX6-NEXT: s_andn2_b32 s4, 15, s11 -; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s6, s6, 1 +; GFX6-NEXT: s_lshl_b32 s4, s6, s4 +; GFX6-NEXT: s_lshr_b32 s2, s8, s2 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_lshr_b32 s9, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s5, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s5 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_lshr_b32 s11, s5, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_lshr_b32 s2, s3, s2 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s11, 15 -; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_lshr_b32 s2, s4, s2 +; GFX6-NEXT: s_andn2_b32 s3, 15, s11 +; GFX6-NEXT: s_lshl_b32 s4, s7, 1 +; GFX6-NEXT: s_lshl_b32 s3, s4, s3 +; GFX6-NEXT: s_lshr_b32 s2, s9, s2 ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 @@ -4501,38 +4531,50 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-LABEL: v_fshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v12, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v12, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v10 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 1, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 -; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, -1, v11 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v11 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index d16dc348209e2..f0869a63a0977 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -517,10 +517,12 @@ define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ; GFX6-LABEL: abs_sgpr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_sext_i32_i16 s1, s1 +; GFX6-NEXT: s_sext_i32_i16 s1, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX6-NEXT: s_abs_i32 s0, s0 ; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: abs_sgpr_v2i16: @@ -561,12 +563,14 @@ define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i16: @@ -602,12 +606,14 @@ define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ; GFX6-LABEL: abs_sgpr_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_sext_i32_i16 s2, s2 +; GFX6-NEXT: s_sext_i32_i16 s2, s0 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX6-NEXT: s_abs_i32 s0, s0 -; GFX6-NEXT: s_abs_i32 s1, s1 ; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: s_or_b32 s0, s2, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: abs_sgpr_v3i16: @@ -653,15 +659,17 @@ define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 1de5e136c400d..18b2430005dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -658,14 +658,8 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v7 -; GFX7-NEXT: v_mov_b32_e32 v4, v8 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: v_load_constant_v6i16_align8: @@ -674,15 +668,12 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 ; GFX6-NEXT: s_waitcnt vmcnt(1) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v0, v6 -; GFX6-NEXT: v_mov_b32_e32 v2, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8 ret <6 x i16> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 518af70cbbf9f..6b13bf675e036 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -765,12 +765,14 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) { ; GFX6-LABEL: v_lshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v2i16: @@ -800,8 +802,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { ; GFX6-LABEL: v_lshr_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 15, 1 +; GFX6-NEXT: v_bfe_u32 v1, v0, 15, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v2i16_15: @@ -831,10 +835,11 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s1, s3 -; GFX6-NEXT: s_lshr_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: s_lshr_b32 s1, s2, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -877,12 +882,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: lshr_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -913,12 +918,12 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -961,22 +966,22 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) { ; GFX6-LABEL: v_lshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v4 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v4i16: @@ -1011,18 +1016,20 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s1, s5 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff -; GFX6-NEXT: s_lshr_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshr_b32 s3, s3, s7 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, s6 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 +; GFX6-NEXT: s_lshr_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s1, s1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s5, s7 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_lshr_v4i16: @@ -1106,38 +1113,38 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-LABEL: v_lshr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v12, v8 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, v13, v9 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, v14, v10 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, v15, v11 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v8i16: @@ -1182,30 +1189,34 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s1, s9 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff -; GFX6-NEXT: s_lshr_b32 s0, s0, s8 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 +; GFX6-NEXT: s_lshr_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s4, s8, s12 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, s5 +; GFX6-NEXT: s_lshr_b32 s5, s9, s13 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshr_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, s10 -; GFX6-NEXT: s_and_b32 s4, s4, 0xffff -; GFX6-NEXT: s_lshr_b32 s5, s5, s13 -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff -; GFX6-NEXT: s_lshr_b32 s7, s7, s15 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 -; GFX6-NEXT: s_lshr_b32 s4, s4, s12 -; GFX6-NEXT: s_lshr_b32 s6, s6, s14 -; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_or_b32 s2, s4, s2 -; GFX6-NEXT: s_or_b32 s3, s6, s3 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, s6 +; GFX6-NEXT: s_lshr_b32 s6, s10, s14 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s5, 16 +; GFX6-NEXT: s_lshr_b32 s3, s3, s7 +; GFX6-NEXT: s_lshr_b32 s7, s11, s15 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_lshr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index 179af494f8da3..3bf687523727d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -513,14 +513,8 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) { define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_or_b32 s0, s2, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16: @@ -546,14 +540,8 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_xor_b32 s0, s3, -1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_commute: @@ -579,14 +567,8 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s1, s3, -1 +; GFX6-NEXT: s_or_b32 s0, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_multi_use: @@ -619,18 +601,9 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_xor_b32 s1, s4, -1 +; GFX6-NEXT: s_or_b32 s0, s2, s1 +; GFX6-NEXT: s_or_b32 s1, s3, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use: @@ -662,26 +635,12 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg % } define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { -; GFX6-LABEL: v_orn2_v2i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_orn2_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_orn2_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_orn2_v2i16: ; GFX10PLUS: ; %bb.0: @@ -697,19 +656,19 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff @@ -750,19 +709,19 @@ define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v3i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff @@ -803,30 +762,30 @@ define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inre define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v3i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_mov_b32 s0, -1 -; GFX6-NEXT: s_and_b32 s5, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_or_b32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX6-NEXT: s_and_b32 s1, s3, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s7 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] ; GFX6-NEXT: s_and_b32 s0, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s6, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s4, 0xffff -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_and_b32 s1, s3, 0xffff +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 -; GFX6-NEXT: s_lshr_b32 s5, s6, 16 +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s7, 0xffff +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v3i16_multi_use: @@ -875,21 +834,25 @@ define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { ; GFX6-LABEL: v_orn2_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, 0xfff5, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, 0xfff5, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v1, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_orn2_v3i16: @@ -917,17 +880,21 @@ define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] @@ -957,17 +924,21 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v4i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] @@ -997,17 +968,21 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v4i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] @@ -1045,23 +1020,29 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: s_lshr_b32 s3, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, 0xffff +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 +; GFX6-NEXT: s_lshr_b32 s5, s7, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_and_b32 s6, s7, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1100,24 +1081,26 @@ define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) { ; GFX6-LABEL: v_orn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_orn2_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index f6e36241a05dc..3b7a6e12a1b7d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -2773,26 +2773,34 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 -; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 +; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: @@ -2834,17 +2842,19 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s5, s0, 0 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_max_i32 s4, s0, 0 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 -; GFX6-NEXT: s_max_i32 s2, s5, s2 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_max_i32 s1, s5, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s4 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 @@ -2914,8 +2924,10 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: saddsat_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s3, s0, 0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 @@ -2981,9 +2993,11 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 @@ -3059,54 +3073,58 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v11, v10 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 -; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v2, v10, v2 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_max_i32_e32 v6, 0, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v11, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 +; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3168,52 +3186,56 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v4i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s9, s0, 0 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_max_i32 s8, s0, 0 ; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 ; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 -; GFX6-NEXT: s_max_i32 s4, s9, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s8 -; GFX6-NEXT: s_min_i32 s8, s1, 0 -; GFX6-NEXT: s_add_i32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_max_i32 s5, s1, 0 +; GFX6-NEXT: s_max_i32 s2, s9, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s8 +; GFX6-NEXT: s_add_i32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s4, 16 +; GFX6-NEXT: s_min_i32 s8, s2, 0 +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_max_i32 s6, s2, 0 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 -; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 ; GFX6-NEXT: s_max_i32 s4, s8, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s4 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_min_i32 s6, s2, 0 -; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 -; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 -; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_min_i32 s6, s1, 0 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 +; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s5 +; GFX6-NEXT: s_max_i32 s4, s1, 0 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s3, s6, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s4 +; GFX6-NEXT: s_add_i32 s1, s1, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s3, 0 -; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_max_i32 s5, s3, 0 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -3319,78 +3341,84 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v15, v14 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 0x7fffffff, v12 -; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 -; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v3, v14, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v12 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_min_i32_e32 v12, 0, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v15, v12 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v13, v9 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 -; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 -; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v9 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_i32_e32 v6, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v15, v9 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v15, v9 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 -; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_i32_e32 v6, v9, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 +; GFX6-NEXT: v_min_i32_e32 v7, 0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_i32_e32 v6, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v15, v7 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GFX6-NEXT: v_max_i32_e32 v5, v7, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3469,76 +3497,82 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v6i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s13, s0, 0 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshr_b32 s9, s3, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_max_i32 s12, s0, 0 ; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13 ; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 -; GFX6-NEXT: s_max_i32 s6, s13, s6 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s12 -; GFX6-NEXT: s_min_i32 s12, s1, 0 -; GFX6-NEXT: s_add_i32 s0, s0, s6 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_max_i32 s7, s1, 0 +; GFX6-NEXT: s_max_i32 s3, s13, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s12 +; GFX6-NEXT: s_add_i32 s0, s0, s3 +; GFX6-NEXT: s_lshl_b32 s3, s6, 16 +; GFX6-NEXT: s_min_i32 s12, s3, 0 +; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_max_i32 s9, s3, 0 ; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 -; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s6, s12, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s6 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_min_i32 s8, s2, 0 -; GFX6-NEXT: s_max_i32 s7, s2, 0 -; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 -; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_min_i32 s8, s3, 0 -; GFX6-NEXT: s_add_i32 s2, s2, s6 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_max_i32 s7, s3, 0 -; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 -; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_min_i32 s8, s4, 0 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s6, s6, s9 +; GFX6-NEXT: s_min_i32 s9, s1, 0 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 ; GFX6-NEXT: s_add_i32 s3, s3, s6 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_max_i32 s6, s1, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX6-NEXT: s_max_i32 s4, s9, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s1, s1, s4 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 +; GFX6-NEXT: s_min_i32 s9, s4, 0 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_max_i32 s7, s4, 0 -; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_lshr_b32 s8, s2, 16 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_max_i32 s6, s9, s6 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_min_i32 s8, s5, 0 +; GFX6-NEXT: s_min_i32 s7, s2, 0 +; GFX6-NEXT: s_lshr_b32 s11, s5, 16 ; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_max_i32 s6, s2, 0 +; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX6-NEXT: s_max_i32 s5, s7, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s6 +; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_lshl_b32 s5, s8, 16 +; GFX6-NEXT: s_min_i32 s8, s5, 0 +; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_max_i32 s7, s5, 0 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_add_i32 s5, s5, s6 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog @@ -3668,102 +3702,110 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v19, v18 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16 -; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 -; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v4, v18, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v16 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_min_i32_e32 v16, 0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_max_i32_e32 v12, 0, v4 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v19, v16 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v17, v12 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v12 +; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_i32_e32 v8, 0, v1 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v19, v12 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v17, v8 +; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; GFX6-NEXT: v_min_i32_e32 v12, 0, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v19, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 -; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_max_i32_e32 v8, v12, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_max_i32_e32 v8, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v19, v9 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v17, v8 +; GFX6-NEXT: v_max_i32_e32 v6, v9, v6 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v9, 0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_max_i32_e32 v8, 0, v3 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v19, v9 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v17, v8 +; GFX6-NEXT: v_max_i32_e32 v7, v9, v7 +; GFX6-NEXT: v_min_i32_e32 v7, v7, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3859,100 +3901,108 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v8i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s17, s0, 0 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_max_i32 s16, s0, 0 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 -; GFX6-NEXT: s_max_i32 s8, s17, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s16 -; GFX6-NEXT: s_min_i32 s16, s1, 0 -; GFX6-NEXT: s_add_i32 s0, s0, s8 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_max_i32 s9, s1, 0 +; GFX6-NEXT: s_max_i32 s4, s17, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s16 +; GFX6-NEXT: s_add_i32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s8, 16 +; GFX6-NEXT: s_min_i32 s16, s4, 0 +; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_max_i32 s12, s4, 0 ; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16 -; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 ; GFX6-NEXT: s_max_i32 s8, s16, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s8 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_min_i32 s10, s2, 0 -; GFX6-NEXT: s_max_i32 s9, s2, 0 -; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 -; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_min_i32 s10, s3, 0 -; GFX6-NEXT: s_add_i32 s2, s2, s8 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_max_i32 s9, s3, 0 -; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 -; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_min_i32 s10, s4, 0 -; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_max_i32 s9, s4, 0 -; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 -; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_min_i32 s10, s5, 0 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s8, s8, s12 +; GFX6-NEXT: s_min_i32 s12, s1, 0 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 ; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_max_i32 s8, s1, 0 +; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s5, s12, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s8 +; GFX6-NEXT: s_add_i32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s5, s9, 16 +; GFX6-NEXT: s_min_i32 s12, s5, 0 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_max_i32 s9, s5, 0 -; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_max_i32 s8, s12, s8 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_min_i32 s10, s6, 0 +; GFX6-NEXT: s_min_i32 s9, s2, 0 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 ; GFX6-NEXT: s_add_i32 s5, s5, s8 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_max_i32 s8, s2, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s6, s9, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, s10, 16 +; GFX6-NEXT: s_min_i32 s10, s6, 0 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_max_i32 s9, s6, 0 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_min_i32 s10, s7, 0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s9, s3, 0 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_max_i32 s8, s3, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s7, s9, s7 +; GFX6-NEXT: s_min_i32 s7, s7, s8 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_add_i32 s3, s3, s7 +; GFX6-NEXT: s_lshl_b32 s7, s11, 16 +; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_min_i32 s10, s7, 0 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_max_i32 s9, s7, 0 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_ashr_i32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index cfe655ff97975..ab7e11a78ed57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -546,8 +546,12 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v2i16_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 8 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sext_inreg_v2i16_8: @@ -581,8 +585,12 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sext_inreg_v2i16_15: @@ -616,12 +624,12 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v2i16_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0x50000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_i32 s1, s0, 0x50000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50010 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_sext_inreg_v2i16_11: @@ -684,18 +692,18 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v4i16_3: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 13 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 13 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 13 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 13 +; GFX6-NEXT: v_bfe_i32 v2, v0, 0, 13 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 13 +; GFX6-NEXT: v_bfe_i32 v3, v1, 0, 13 +; GFX6-NEXT: v_bfe_i32 v1, v1, 16, 13 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sext_inreg_v4i16_3: @@ -739,18 +747,18 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) { define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v4i16_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0x20000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_bfe_i32 s2, s2, 0x20000 -; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20000 +; GFX6-NEXT: s_bfe_i32 s2, s0, 0x20000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20010 +; GFX6-NEXT: s_bfe_i32 s3, s1, 0x20000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x20010 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_or_b32 s0, s2, s0 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_sext_inreg_v4i16_14: @@ -847,30 +855,30 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v8i16_11: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 5 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 5 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 5 +; GFX6-NEXT: v_bfe_i32 v4, v0, 0, 5 +; GFX6-NEXT: v_bfe_i32 v0, v0, 16, 5 +; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 5 +; GFX6-NEXT: v_bfe_i32 v1, v1, 16, 5 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_bfe_i32 v6, v2, 0, 5 +; GFX6-NEXT: v_bfe_i32 v2, v2, 16, 5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_bfe_i32 v7, v3, 0, 5 +; GFX6-NEXT: v_bfe_i32 v3, v3, 16, 5 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 5 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 5 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sext_inreg_v8i16_11: @@ -932,30 +940,30 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) { define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v8i16_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0xb0000 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xb0000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_bfe_i32 s2, s2, 0xb0000 -; GFX6-NEXT: s_bfe_i32 s3, s3, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s4, s0, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0xb0010 +; GFX6-NEXT: s_bfe_i32 s5, s1, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0xb0010 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_bfe_i32 s6, s2, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s2, s2, 0xb0010 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_i32 s7, s3, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0xb0010 +; GFX6-NEXT: s_or_b32 s0, s4, s0 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_bfe_i32 s5, s5, 0xb0000 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_bfe_i32 s4, s4, 0xb0000 -; GFX6-NEXT: s_bfe_i32 s7, s7, 0xb0000 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_or_b32 s1, s4, s1 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_bfe_i32 s6, s6, 0xb0000 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_sext_inreg_v8i16_5: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 256d6d9a16fa9..511f04050f07e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -604,9 +604,6 @@ define i32 @v_shl_i32_zext_i16(i16 %x) { define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; GFX7-LABEL: s_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 @@ -661,9 +658,6 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) { ; GFX7-LABEL: v_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 2f03c7156babc..f589919992335 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -703,10 +703,15 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) { ; GFX6-LABEL: v_shl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_shl_v2i16: @@ -736,8 +741,11 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) { ; GFX6-LABEL: v_shl_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_shl_v2i16_15: @@ -767,8 +775,10 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s3 -; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, s3 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -812,7 +822,8 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: shl_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 @@ -848,10 +859,11 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: shl_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -896,20 +908,22 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) { ; GFX6-LABEL: v_shl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v5 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -946,16 +960,20 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s5 -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, s7 +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s4, s6 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, s7 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -1039,36 +1057,40 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-LABEL: v_shl_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v12, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, v13, v9 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, v8, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, v14, v10 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, v15, v11 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1115,28 +1137,36 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s9 -; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, s10 -; GFX6-NEXT: s_lshl_b32 s3, s3, s11 +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s8, s12 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s5, s9, s13 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, s13 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, s12 -; GFX6-NEXT: s_lshl_b32 s7, s7, s15 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_lshl_b32 s6, s6, s14 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, s10, s14 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, s7 +; GFX6-NEXT: s_lshl_b32 s7, s11, s15 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 0a67d1a84ed7a..677baf991fd1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -2777,27 +2777,34 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 -; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 +; GFX6-NEXT: v_min_i32_e32 v1, v1, v6 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: @@ -2839,16 +2846,18 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s4, s0, -1 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 ; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 -; GFX6-NEXT: s_max_i32 s2, s4, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s5 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_max_i32 s1, s4, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s5 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 ; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 @@ -2919,8 +2928,10 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: ssubsat_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 @@ -2986,8 +2997,10 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: ssubsat_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v0 @@ -3064,54 +3077,58 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v10 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_max_i32_e32 v6, -1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3173,52 +3190,56 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v4i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 ; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 -; GFX6-NEXT: s_max_i32 s4, s8, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s9 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 -; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_max_i32 s2, s8, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_max_i32 s6, s2, -1 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 +; GFX6-NEXT: s_min_i32 s8, s2, -1 ; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 -; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s8 -; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 -; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 -; GFX6-NEXT: s_max_i32 s4, s5, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_max_i32 s4, s1, -1 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 +; GFX6-NEXT: s_min_i32 s6, s1, -1 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_max_i32 s3, s4, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s1, s1, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -3324,78 +3345,84 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 0x80000001, v12 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v3, v12, v3 +; GFX6-NEXT: v_min_i32_e32 v3, v3, v14 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000001 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GFX6-NEXT: v_min_i32_e32 v12, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v6, v9, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 -; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_max_i32_e32 v6, -1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_min_i32_e32 v9, -1, v1 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v9 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_min_i32_e32 v9, -1, v4 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_max_i32_e32 v6, -1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_min_i32_e32 v7, -1, v2 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GFX6-NEXT: v_max_i32_e32 v5, v6, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3474,76 +3501,82 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v6i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s12, s0, -1 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshr_b32 s9, s3, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_add_i32 s12, s12, 0x80000001 ; GFX6-NEXT: s_min_i32 s13, s0, -1 ; GFX6-NEXT: s_add_i32 s13, s13, 0x80000000 -; GFX6-NEXT: s_max_i32 s6, s12, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s13 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 -; GFX6-NEXT: s_min_i32 s12, s1, -1 +; GFX6-NEXT: s_max_i32 s3, s12, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s13 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_lshl_b32 s3, s6, 16 +; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_max_i32 s9, s3, -1 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 +; GFX6-NEXT: s_min_i32 s12, s3, -1 ; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 -; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_max_i32 s6, s9, s6 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s12 -; GFX6-NEXT: s_max_i32 s7, s2, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, s6 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 -; GFX6-NEXT: s_min_i32 s8, s2, -1 -; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 -; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s8 -; GFX6-NEXT: s_max_i32 s7, s3, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 -; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 -; GFX6-NEXT: s_max_i32 s6, s7, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s3, s3, s6 +; GFX6-NEXT: s_max_i32 s6, s1, -1 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 +; GFX6-NEXT: s_min_i32 s9, s1, -1 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s4, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 -; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_min_i32 s9, s4, -1 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 +; GFX6-NEXT: s_lshr_b32 s8, s2, 16 +; GFX6-NEXT: s_min_i32 s6, s6, s9 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_max_i32 s6, s2, -1 +; GFX6-NEXT: s_lshr_b32 s11, s5, 16 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 +; GFX6-NEXT: s_min_i32 s7, s2, -1 +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_max_i32 s5, s6, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s7 +; GFX6-NEXT: s_sub_i32 s2, s2, s5 +; GFX6-NEXT: s_lshl_b32 s5, s8, 16 ; GFX6-NEXT: s_max_i32 s7, s5, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s5, -1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_ashr_i32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog @@ -3673,102 +3706,110 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v17, 0x80000001 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v19 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_min_i32_e32 v16, -1, v1 +; GFX6-NEXT: v_max_i32_e32 v4, v16, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v18 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_max_i32_e32 v12, -1, v4 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GFX6-NEXT: v_min_i32_e32 v16, -1, v4 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v19 -; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v8, v12, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 -; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 -; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 -; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 -; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_max_i32_e32 v8, -1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_min_i32_e32 v12, -1, v5 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v12 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_max_i32_e32 v8, -1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; GFX6-NEXT: v_min_i32_e32 v9, -1, v2 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_max_i32_e32 v8, -1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; GFX6-NEXT: v_min_i32_e32 v9, -1, v3 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GFX6-NEXT: v_max_i32_e32 v7, v8, v7 +; GFX6-NEXT: v_min_i32_e32 v7, v7, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3864,100 +3905,108 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v8i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s16, s0, -1 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s0, -1 ; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 -; GFX6-NEXT: s_max_i32 s8, s16, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s17 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s0, s0, s8 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 -; GFX6-NEXT: s_min_i32 s16, s1, -1 +; GFX6-NEXT: s_max_i32 s4, s16, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s17 +; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s8, 16 +; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_max_i32 s12, s4, -1 +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000001 +; GFX6-NEXT: s_min_i32 s16, s4, -1 ; GFX6-NEXT: s_add_i32 s16, s16, 0x80000000 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_max_i32 s8, s12, s8 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s16 -; GFX6-NEXT: s_max_i32 s9, s2, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 -; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_max_i32 s9, s3, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s8 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 -; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_max_i32 s9, s4, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 -; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 -; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_max_i32 s8, s1, -1 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s10 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 +; GFX6-NEXT: s_min_i32 s12, s1, -1 +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 +; GFX6-NEXT: s_max_i32 s5, s8, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s12 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s5, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s5, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 -; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_min_i32 s12, s5, -1 +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NEXT: s_min_i32 s8, s8, s12 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_max_i32 s8, s2, -1 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s10 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 +; GFX6-NEXT: s_min_i32 s9, s2, -1 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s8, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_max_i32 s9, s6, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s6, -1 ; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_max_i32 s9, s7, -1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 +; GFX6-NEXT: s_max_i32 s8, s3, -1 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 +; GFX6-NEXT: s_min_i32 s9, s3, -1 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s7, s8, s7 +; GFX6-NEXT: s_min_i32 s7, s7, s9 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_lshl_b32 s7, s11, 16 +; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_max_i32 s9, s7, -1 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s7, -1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0xffff ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_ashr_i32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll index e2fb704599250..a0cac724ef2c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll @@ -203,8 +203,14 @@ define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX7-LABEL: v_sub_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sub_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 2c9519fa9d8a5..a30e3115cc463 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2172,18 +2172,20 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_not_b32_e32 v4, v0 -; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_not_b32_e32 v4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, v4, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_not_b32_e32 v3, v1 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: @@ -2212,12 +2214,14 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_not_b32 s4, s0 -; GFX6-NEXT: s_min_u32 s2, s4, s2 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s0, s0, s2 +; GFX6-NEXT: s_not_b32 s4, s0 +; GFX6-NEXT: s_min_u32 s1, s4, s1 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_not_b32 s3, s1 ; GFX6-NEXT: s_min_u32 s2, s3, s2 @@ -2264,7 +2268,9 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: uaddsat_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_not_b32 s2, s0 ; GFX6-NEXT: v_min_u32_e32 v0, s2, v0 @@ -2304,7 +2310,9 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: uaddsat_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_not_b32_e32 v2, v0 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 @@ -2356,30 +2364,34 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_not_b32_e32 v8, v0 -; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_not_b32_e32 v5, v1 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_not_b32_e32 v8, v0 +; GFX6-NEXT: v_min_u32_e32 v2, v8, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_not_b32_e32 v5, v2 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v4, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_not_b32_e32 v4, v1 +; GFX6-NEXT: v_min_u32_e32 v3, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_not_b32_e32 v5, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v4i16: @@ -2414,31 +2426,35 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v4i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_not_b32 s8, s0 -; GFX6-NEXT: s_min_u32 s4, s8, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_not_b32 s5, s1 -; GFX6-NEXT: s_min_u32 s4, s5, s4 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s4 +; GFX6-NEXT: s_not_b32 s8, s0 +; GFX6-NEXT: s_min_u32 s2, s8, s2 +; GFX6-NEXT: s_add_i32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s4, 16 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_not_b32 s5, s2 -; GFX6-NEXT: s_min_u32 s4, s5, s4 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_not_b32 s6, s2 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_min_u32 s4, s6, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 ; GFX6-NEXT: s_add_i32 s2, s2, s4 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_not_b32 s4, s1 +; GFX6-NEXT: s_min_u32 s3, s4, s3 +; GFX6-NEXT: s_add_i32 s1, s1, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_not_b32 s5, s3 ; GFX6-NEXT: s_min_u32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s2, v0, 16 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -2509,42 +2525,48 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_not_b32_e32 v12, v0 -; GFX6-NEXT: v_min_u32_e32 v6, v12, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_not_b32_e32 v7, v1 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_not_b32_e32 v12, v0 +; GFX6-NEXT: v_min_u32_e32 v3, v12, v3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_not_b32_e32 v7, v3 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_not_b32_e32 v9, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v6, v9, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v1 +; GFX6-NEXT: v_min_u32_e32 v4, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: v_min_u32_e32 v5, v6, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_not_b32_e32 v7, v5 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v6i16: @@ -2584,44 +2606,50 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v6i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_not_b32 s12, s0 -; GFX6-NEXT: s_min_u32 s6, s12, s6 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s0, s0, s6 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_not_b32 s7, s1 -; GFX6-NEXT: s_min_u32 s6, s7, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s6 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_not_b32 s7, s2 -; GFX6-NEXT: s_min_u32 s6, s7, s6 +; GFX6-NEXT: s_lshr_b32 s9, s3, 16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_add_i32 s2, s2, s6 +; GFX6-NEXT: s_not_b32 s12, s0 +; GFX6-NEXT: s_min_u32 s3, s12, s3 +; GFX6-NEXT: s_add_i32 s0, s0, s3 +; GFX6-NEXT: s_lshl_b32 s3, s6, 16 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_not_b32 s7, s3 -; GFX6-NEXT: s_min_u32 s6, s7, s6 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_not_b32 s9, s3 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_min_u32 s6, s9, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 ; GFX6-NEXT: s_add_i32 s3, s3, s6 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_not_b32 s6, s1 +; GFX6-NEXT: s_min_u32 s4, s6, s4 +; GFX6-NEXT: s_add_i32 s1, s1, s4 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshr_b32 s8, s2, 16 ; GFX6-NEXT: s_min_u32 s6, s7, s6 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshr_b32 s11, s5, 16 ; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_not_b32 s6, s2 +; GFX6-NEXT: s_min_u32 s5, s6, s5 +; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_lshl_b32 s5, s8, 16 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_not_b32 s7, s5 ; GFX6-NEXT: s_min_u32 s6, s7, s6 ; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -2699,54 +2727,62 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_not_b32_e32 v16, v0 -; GFX6-NEXT: v_min_u32_e32 v8, v16, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_not_b32_e32 v9, v1 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_not_b32_e32 v9, v2 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_not_b32_e32 v9, v3 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GFX6-NEXT: v_not_b32_e32 v16, v0 +; GFX6-NEXT: v_min_u32_e32 v4, v16, v4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_not_b32_e32 v9, v4 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v12, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v8, v12, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v8, v1 +; GFX6-NEXT: v_min_u32_e32 v5, v8, v5 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_not_b32_e32 v9, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: v_min_u32_e32 v6, v8, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_not_b32_e32 v9, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_not_b32_e32 v8, v3 +; GFX6-NEXT: v_min_u32_e32 v7, v8, v7 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_not_b32_e32 v9, v7 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v7, v3, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v8i16: @@ -2791,57 +2827,65 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v8i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_not_b32 s16, s0 -; GFX6-NEXT: s_min_u32 s8, s16, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s0, s0, s8 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_not_b32 s9, s1 -; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_add_i32 s1, s1, s8 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_not_b32 s9, s2 -; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_add_i32 s2, s2, s8 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_not_b32 s9, s3 -; GFX6-NEXT: s_min_u32 s8, s9, s8 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_add_i32 s3, s3, s8 +; GFX6-NEXT: s_not_b32 s16, s0 +; GFX6-NEXT: s_min_u32 s4, s16, s4 +; GFX6-NEXT: s_add_i32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s8, 16 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_not_b32 s9, s4 -; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_not_b32 s12, s4 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: s_min_u32 s8, s12, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 ; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_not_b32 s8, s1 +; GFX6-NEXT: s_min_u32 s5, s8, s5 +; GFX6-NEXT: s_add_i32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s5, s9, 16 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_not_b32 s9, s5 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 ; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 ; GFX6-NEXT: s_add_i32 s5, s5, s8 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_not_b32 s8, s2 +; GFX6-NEXT: s_min_u32 s6, s8, s6 +; GFX6-NEXT: s_add_i32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_not_b32 s9, s6 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 ; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_not_b32 s8, s3 +; GFX6-NEXT: s_min_u32 s7, s8, s7 +; GFX6-NEXT: s_add_i32 s3, s3, s7 +; GFX6-NEXT: s_lshl_b32 s7, s11, 16 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_not_b32 s9, s7 ; GFX6-NEXT: s_min_u32 s8, s9, s8 ; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, s6, v2, 16 ; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0b5224b0079b5..1c8a26ce6126d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2086,16 +2086,18 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: @@ -2124,11 +2126,13 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_u32 s2, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_u32 s1, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_min_u32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -2174,6 +2178,8 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: usubsat_v2i16_sv: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 @@ -2212,6 +2218,8 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: usubsat_v2i16_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v0 @@ -2262,26 +2270,30 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_min_u32_e32 v4, v1, v4 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_u32_e32 v4, v2, v4 +; GFX6-NEXT: v_min_u32_e32 v3, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i16: @@ -2316,27 +2328,31 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v4i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_min_u32 s4, s0, s4 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_min_u32 s4, s1, s4 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_min_u32 s2, s0, s2 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_lshr_b32 s7, s3, 16 +; GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s2, s4, 16 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_u32 s4, s2, s4 +; GFX6-NEXT: s_min_u32 s3, s1, s3 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_sub_i32 s1, s1, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_min_u32 s4, s3, s4 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s2, v0, 16 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -2407,36 +2423,42 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_min_u32_e32 v6, v0, v6 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_min_u32_e32 v6, v1, v6 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_min_u32_e32 v6, v2, v6 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_u32_e32 v3, v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_u32_e32 v6, v3, v6 +; GFX6-NEXT: v_min_u32_e32 v4, v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_u32_e32 v6, v4, v6 +; GFX6-NEXT: v_min_u32_e32 v5, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, v5, v2, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v6i16: @@ -2476,38 +2498,44 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v6i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s9, s3, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_min_u32 s6, s0, s6 -; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_min_u32 s6, s1, s6 -; GFX6-NEXT: s_sub_i32 s1, s1, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_min_u32 s6, s2, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_min_u32 s3, s0, s3 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_lshl_b32 s3, s6, 16 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_u32 s6, s3, s6 +; GFX6-NEXT: s_min_u32 s4, s1, s4 +; GFX6-NEXT: s_lshr_b32 s8, s2, 16 +; GFX6-NEXT: s_lshr_b32 s11, s5, 16 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_u32 s6, s4, s6 +; GFX6-NEXT: s_min_u32 s5, s2, s5 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_sub_i32 s2, s2, s5 +; GFX6-NEXT: s_lshl_b32 s5, s8, 16 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_min_u32 s6, s5, s6 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -2585,46 +2613,54 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_min_u32_e32 v8, v0, v8 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_min_u32_e32 v8, v1, v8 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_min_u32_e32 v8, v2, v8 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_min_u32_e32 v8, v3, v8 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_u32_e32 v8, v4, v8 +; GFX6-NEXT: v_min_u32_e32 v5, v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_u32_e32 v8, v5, v8 +; GFX6-NEXT: v_min_u32_e32 v6, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_u32_e32 v8, v6, v8 +; GFX6-NEXT: v_min_u32_e32 v7, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GFX6-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX6-NEXT: v_alignbit_b32 v3, v7, v3, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v8i16: @@ -2669,49 +2705,57 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v8i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_min_u32 s8, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s0, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_min_u32 s8, s1, s8 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_min_u32 s8, s2, s8 -; GFX6-NEXT: s_sub_i32 s2, s2, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_min_u32 s8, s3, s8 -; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_min_u32 s4, s0, s4 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: s_lshr_b32 s13, s5, 16 +; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b32 s4, s8, 16 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_u32 s8, s4, s8 +; GFX6-NEXT: s_min_u32 s5, s1, s5 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NEXT: s_lshr_b32 s14, s6, 16 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s5, s9, 16 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_u32 s8, s5, s8 +; GFX6-NEXT: s_min_u32 s6, s2, s6 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 +; GFX6-NEXT: s_lshr_b32 s15, s7, 16 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_u32 s8, s6, s8 +; GFX6-NEXT: s_min_u32 s7, s3, s7 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_lshl_b32 s7, s11, 16 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_min_u32 s8, s7, s8 ; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v2, s6, v2, 16 ; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 09fc810cce000..8371b6c033e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -27,32 +27,11 @@ entry: ; FIXME: fails to match define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { -; GFX7-LABEL: scalar_xnor_v2i16_one_use: -; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_or_b32 s0, s1, s0 -; GFX7-NEXT: s_xor_b32 s0, s0, -1 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: scalar_xnor_v2i16_one_use: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s0, s0, -1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX900-LABEL: scalar_xnor_v2i16_one_use: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_xor_b32 s0, s0, s1 -; GFX900-NEXT: s_xor_b32 s0, s0, -1 -; GFX900-NEXT: ; return to shader part epilog -; -; GFX906-LABEL: scalar_xnor_v2i16_one_use: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_xor_b32 s0, s0, s1 -; GFX906-NEXT: s_xor_b32 s0, s0, -1 -; GFX906-NEXT: ; return to shader part epilog +; GCN-LABEL: scalar_xnor_v2i16_one_use: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_xor_b32 s0, s0, -1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: scalar_xnor_v2i16_one_use: ; GFX10: ; %bb.0: ; %entry @@ -110,8 +89,14 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v4i16_one_use: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GFX7-NEXT: s_mov_b32 s4, s1 +; GFX7-NEXT: s_mov_b32 s6, s3 +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s2, 16 +; GFX7-NEXT: s_lshr_b32 s5, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s6, 16 +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll index 66cc7f3db03c2..7d99f29807a87 100644 --- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll @@ -94,27 +94,27 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: v_abs_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v2, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v0, v2, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v2, v0 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX7-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_i32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v2i16: @@ -169,35 +169,33 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: v_abs_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v3, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v0, v3, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v3i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v3, v0 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v2, v3, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v3, v1 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_i32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v3i16: @@ -262,45 +260,43 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) { ; GFX6-LABEL: v_abs_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v4, v0 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 +; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v0, v4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 16, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v4, v0 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v4, v1 +; GFX7-NEXT: v_max_i32_e32 v3, v4, v3 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v2, v4, v2 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v4, v1 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_i32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v4i16: @@ -368,61 +364,59 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) { ; GFX6-LABEL: v_abs_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v6, v0 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v5, v6, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v6, v2 +; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v2, v6, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v0, v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v6i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v6, v0 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v6, v1 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; GFX7-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v5, 16, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX7-NEXT: v_max_i32_e32 v5, v6, v5 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v6, v2 +; GFX7-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v3, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v2, v6, v2 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v6, v1 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 +; GFX7-NEXT: v_max_i32_e32 v0, v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v6i16: @@ -505,79 +499,75 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) { ; GFX6-LABEL: v_abs_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v8, v0 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v1 ; GFX6-NEXT: v_max_i32_e32 v7, v8, v7 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v0 +; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, v8, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v0, v8, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v8i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v8, v0 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v8, v1 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 -; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v5, v8, v5 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 16, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v6, 16, v2 ; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7 +; GFX7-NEXT: v_ashrrev_i32_e32 v5, 16, v1 ; GFX7-NEXT: v_max_i32_e32 v7, v8, v7 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GFX7-NEXT: v_ashrrev_i32_e32 v4, 16, v0 +; GFX7-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX7-NEXT: v_max_i32_e32 v5, v8, v5 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v3, v8, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v8, v1 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 +; GFX7-NEXT: v_max_i32_e32 v0, v8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v8i16: @@ -674,147 +664,139 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) { ; GFX6-LABEL: v_abs_v16i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v16, v0 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v16, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v4, v16, v4 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v5, v16, v5 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v9, v16, v9 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 -; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v12, v16, v12 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 -; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v13, v16, v13 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 -; GFX6-NEXT: v_max_i32_e32 v14, v16, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 16, v6 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v15, v16, v15 -; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 -; GFX6-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 16, v4 +; GFX6-NEXT: v_max_i32_e32 v14, v16, v14 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX6-NEXT: v_max_i32_e32 v13, v16, v13 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 16, v2 +; GFX6-NEXT: v_max_i32_e32 v12, v16, v12 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX6-NEXT: v_max_i32_e32 v11, v16, v11 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 16, v0 +; GFX6-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX6-NEXT: v_max_i32_e32 v9, v16, v9 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v7 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v7, v16, v7 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX6-NEXT: v_max_i32_e32 v5, v16, v5 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v16, v4 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, v16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v16, v1 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; GFX6-NEXT: v_max_i32_e32 v0, v16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX6-NEXT: v_or_b32_e32 v7, v7, v15 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v16i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v16, v0 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v16, v1 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 -; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v4, v16, v4 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v5, v16, v5 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 -; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v8, v16, v8 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 -; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v9, v16, v9 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 -; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v12, v16, v12 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 -; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v13, v16, v13 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 -; GFX7-NEXT: v_max_i32_e32 v14, v16, v14 +; GFX7-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GFX7-NEXT: v_ashrrev_i32_e32 v14, 16, v6 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15 +; GFX7-NEXT: v_ashrrev_i32_e32 v13, 16, v5 ; GFX7-NEXT: v_max_i32_e32 v15, v16, v15 -; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 -; GFX7-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 +; GFX7-NEXT: v_ashrrev_i32_e32 v12, 16, v4 +; GFX7-NEXT: v_max_i32_e32 v14, v16, v14 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 +; GFX7-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX7-NEXT: v_max_i32_e32 v13, v16, v13 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 +; GFX7-NEXT: v_ashrrev_i32_e32 v10, 16, v2 +; GFX7-NEXT: v_max_i32_e32 v12, v16, v12 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v11 +; GFX7-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX7-NEXT: v_max_i32_e32 v11, v16, v11 -; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 16, v0 +; GFX7-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 ; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX7-NEXT: v_max_i32_e32 v9, v16, v9 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v8, v16, v8 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v7 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v7, v16, v7 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX7-NEXT: v_max_i32_e32 v5, v16, v5 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v16, v4 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v3 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v3, v16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v16, v1 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; GFX7-NEXT: v_max_i32_e32 v0, v16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v16i16: @@ -958,287 +940,267 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX6-LABEL: v_abs_v32i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v0 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v0, v31, v0 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v31, v1 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v4 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v4, v31, v4 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v5 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v5, v31, v5 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v8 -; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v8, v31, v8 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v9 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v9, v31, v9 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v12 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v12, v31, v12 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v13 -; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v13, v31, v13 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v16 -; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v17 -; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v17, v31, v17 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v20 -; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v20, v31, v20 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v21 -; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v21, v31, v21 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 -; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v24, v31, v24 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 -; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v25, v31, v25 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 -; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v28, v31, v28 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 -; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v29, v31, v29 +; GFX6-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 16, v14 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v18 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 16, v13 +; GFX6-NEXT: v_max_i32_e32 v18, v19, v18 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 +; GFX6-NEXT: v_max_i32_e32 v17, v19, v17 +; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 +; GFX6-NEXT: v_max_i32_e32 v16, v19, v16 +; GFX6-NEXT: v_ashrrev_i32_e32 v19, 16, v12 +; GFX6-NEXT: v_sub_i32_e32 v20, vcc, 0, v19 +; GFX6-NEXT: v_max_i32_e32 v19, v20, v19 +; GFX6-NEXT: v_ashrrev_i32_e32 v20, 16, v11 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v20 +; GFX6-NEXT: v_max_i32_e32 v20, v21, v20 +; GFX6-NEXT: v_ashrrev_i32_e32 v21, 16, v10 +; GFX6-NEXT: v_sub_i32_e32 v22, vcc, 0, v21 +; GFX6-NEXT: v_max_i32_e32 v21, v22, v21 +; GFX6-NEXT: v_ashrrev_i32_e32 v22, 16, v9 +; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v22 +; GFX6-NEXT: v_max_i32_e32 v22, v23, v22 +; GFX6-NEXT: v_ashrrev_i32_e32 v23, 16, v8 +; GFX6-NEXT: v_sub_i32_e32 v24, vcc, 0, v23 +; GFX6-NEXT: v_max_i32_e32 v23, v24, v23 +; GFX6-NEXT: v_ashrrev_i32_e32 v24, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v24 +; GFX6-NEXT: v_max_i32_e32 v24, v25, v24 +; GFX6-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GFX6-NEXT: v_sub_i32_e32 v26, vcc, 0, v25 +; GFX6-NEXT: v_max_i32_e32 v25, v26, v25 +; GFX6-NEXT: v_ashrrev_i32_e32 v26, 16, v5 +; GFX6-NEXT: v_sub_i32_e32 v27, vcc, 0, v26 +; GFX6-NEXT: v_max_i32_e32 v26, v27, v26 +; GFX6-NEXT: v_ashrrev_i32_e32 v27, 16, v4 +; GFX6-NEXT: v_sub_i32_e32 v28, vcc, 0, v27 +; GFX6-NEXT: v_max_i32_e32 v27, v28, v27 +; GFX6-NEXT: v_ashrrev_i32_e32 v28, 16, v3 +; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v28 +; GFX6-NEXT: v_max_i32_e32 v28, v29, v28 +; GFX6-NEXT: v_ashrrev_i32_e32 v29, 16, v2 +; GFX6-NEXT: v_sub_i32_e32 v30, vcc, 0, v29 +; GFX6-NEXT: v_max_i32_e32 v29, v30, v29 +; GFX6-NEXT: v_ashrrev_i32_e32 v30, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30 ; GFX6-NEXT: v_max_i32_e32 v30, v31, v30 -; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16 -; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX6-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX6-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX6-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v31, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v31 +; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v31, v32, v31 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX6-NEXT: v_or_b32_e32 v30, v30, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v26 -; GFX6-NEXT: v_max_i32_e32 v26, v32, v26 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v27 -; GFX6-NEXT: v_max_i32_e32 v27, v32, v27 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX6-NEXT: v_or_b32_e32 v26, v26, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 -; GFX6-NEXT: v_max_i32_e32 v22, v32, v22 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v23 -; GFX6-NEXT: v_max_i32_e32 v23, v32, v23 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX6-NEXT: v_or_b32_e32 v22, v22, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v18 -; GFX6-NEXT: v_max_i32_e32 v18, v32, v18 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v19 -; GFX6-NEXT: v_max_i32_e32 v19, v32, v19 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 -; GFX6-NEXT: v_max_i32_e32 v14, v32, v14 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v15 +; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v15, v32, v15 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 -; GFX6-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 +; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v14, v32, v14 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v13 +; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v13, v32, v13 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v12 +; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v12, v32, v12 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v11 +; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v11, v32, v11 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 +; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v9 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v9, v32, v9 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v8 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v8, v32, v8 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v7 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v7, v32, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v32 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v5 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v5, v32, v5 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v4 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v32, v4 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v3 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, v32, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v32 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX6-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GFX6-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GFX6-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GFX6-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v32, v1 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX6-NEXT: v_max_i32_e32 v0, v32, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX6-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX6-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX6-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX6-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX6-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX6-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX6-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX6-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v31 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v30 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v28 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v27 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v26 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX6-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v23 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v22 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v20 +; GFX6-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX6-NEXT: v_or_b32_e32 v15, v15, v18 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v32i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v0 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v0, v31, v0 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v31, v1 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v4 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v4, v31, v4 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v5 -; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v5, v31, v5 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v8 -; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v8, v31, v8 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v9 -; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v9, v31, v9 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v12 -; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v12, v31, v12 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v13 -; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v13, v31, v13 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v16 -; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v16, v31, v16 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v17 -; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v17, v31, v17 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v20 -; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v20, v31, v20 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v21 -; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v21, v31, v21 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 -; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v24, v31, v24 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 -; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v25, v31, v25 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 -; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v28, v31, v28 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 -; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v29, v31, v29 +; GFX7-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GFX7-NEXT: v_ashrrev_i32_e32 v17, 16, v14 +; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v18 +; GFX7-NEXT: v_ashrrev_i32_e32 v16, 16, v13 +; GFX7-NEXT: v_max_i32_e32 v18, v19, v18 +; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 +; GFX7-NEXT: v_max_i32_e32 v17, v19, v17 +; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 +; GFX7-NEXT: v_max_i32_e32 v16, v19, v16 +; GFX7-NEXT: v_ashrrev_i32_e32 v19, 16, v12 +; GFX7-NEXT: v_sub_i32_e32 v20, vcc, 0, v19 +; GFX7-NEXT: v_max_i32_e32 v19, v20, v19 +; GFX7-NEXT: v_ashrrev_i32_e32 v20, 16, v11 +; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v20 +; GFX7-NEXT: v_max_i32_e32 v20, v21, v20 +; GFX7-NEXT: v_ashrrev_i32_e32 v21, 16, v10 +; GFX7-NEXT: v_sub_i32_e32 v22, vcc, 0, v21 +; GFX7-NEXT: v_max_i32_e32 v21, v22, v21 +; GFX7-NEXT: v_ashrrev_i32_e32 v22, 16, v9 +; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v22 +; GFX7-NEXT: v_max_i32_e32 v22, v23, v22 +; GFX7-NEXT: v_ashrrev_i32_e32 v23, 16, v8 +; GFX7-NEXT: v_sub_i32_e32 v24, vcc, 0, v23 +; GFX7-NEXT: v_max_i32_e32 v23, v24, v23 +; GFX7-NEXT: v_ashrrev_i32_e32 v24, 16, v7 +; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v24 +; GFX7-NEXT: v_max_i32_e32 v24, v25, v24 +; GFX7-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GFX7-NEXT: v_sub_i32_e32 v26, vcc, 0, v25 +; GFX7-NEXT: v_max_i32_e32 v25, v26, v25 +; GFX7-NEXT: v_ashrrev_i32_e32 v26, 16, v5 +; GFX7-NEXT: v_sub_i32_e32 v27, vcc, 0, v26 +; GFX7-NEXT: v_max_i32_e32 v26, v27, v26 +; GFX7-NEXT: v_ashrrev_i32_e32 v27, 16, v4 +; GFX7-NEXT: v_sub_i32_e32 v28, vcc, 0, v27 +; GFX7-NEXT: v_max_i32_e32 v27, v28, v27 +; GFX7-NEXT: v_ashrrev_i32_e32 v28, 16, v3 +; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v28 +; GFX7-NEXT: v_max_i32_e32 v28, v29, v28 +; GFX7-NEXT: v_ashrrev_i32_e32 v29, 16, v2 +; GFX7-NEXT: v_sub_i32_e32 v30, vcc, 0, v29 +; GFX7-NEXT: v_max_i32_e32 v29, v30, v29 +; GFX7-NEXT: v_ashrrev_i32_e32 v30, 16, v1 ; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30 ; GFX7-NEXT: v_max_i32_e32 v30, v31, v30 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16 -; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX7-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX7-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX7-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX7-NEXT: v_ashrrev_i32_e32 v31, 16, v0 ; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v31 +; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v31, v32, v31 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX7-NEXT: v_or_b32_e32 v30, v30, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v26 -; GFX7-NEXT: v_max_i32_e32 v26, v32, v26 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v27 -; GFX7-NEXT: v_max_i32_e32 v27, v32, v27 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX7-NEXT: v_or_b32_e32 v26, v26, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 -; GFX7-NEXT: v_max_i32_e32 v22, v32, v22 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v23 -; GFX7-NEXT: v_max_i32_e32 v23, v32, v23 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX7-NEXT: v_or_b32_e32 v22, v22, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v18 -; GFX7-NEXT: v_max_i32_e32 v18, v32, v18 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v19 -; GFX7-NEXT: v_max_i32_e32 v19, v32, v19 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX7-NEXT: v_or_b32_e32 v18, v18, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 -; GFX7-NEXT: v_max_i32_e32 v14, v32, v14 ; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v15 +; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v15, v32, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 -; GFX7-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 +; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v14, v32, v14 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v13 +; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v13, v32, v13 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v12 +; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v12, v32, v12 ; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v11 +; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v11, v32, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 +; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v9 +; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v9, v32, v9 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v8 +; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v8, v32, v8 ; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v7 +; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v7, v32, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v32 -; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v5 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v5, v32, v5 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v4 +; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v32, v4 ; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v3 +; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX7-NEXT: v_max_i32_e32 v3, v32, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v32 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v1 +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v32, v1 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_max_i32_e32 v0, v32, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v31 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v30 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v28 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v27 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v26 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v23 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v22 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v20 +; GFX7-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX7-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX7-NEXT: v_or_b32_e32 v15, v15, v18 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 2fef934fa472e..2ce67c3848bae 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -22674,27 +22674,86 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -22725,193 +22784,130 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -22943,85 +22939,82 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -23038,369 +23031,274 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v64bf16: @@ -23576,104 +23474,104 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v21, s30, 0 -; SI-NEXT: v_writelane_b32 v21, s31, 1 -; SI-NEXT: v_writelane_b32 v21, s34, 2 -; SI-NEXT: v_writelane_b32 v21, s35, 3 -; SI-NEXT: v_writelane_b32 v21, s36, 4 -; SI-NEXT: v_writelane_b32 v21, s37, 5 -; SI-NEXT: v_writelane_b32 v21, s38, 6 -; SI-NEXT: v_writelane_b32 v21, s39, 7 -; SI-NEXT: v_writelane_b32 v21, s48, 8 -; SI-NEXT: v_writelane_b32 v21, s49, 9 -; SI-NEXT: v_writelane_b32 v21, s50, 10 -; SI-NEXT: v_writelane_b32 v21, s51, 11 -; SI-NEXT: v_writelane_b32 v21, s52, 12 -; SI-NEXT: v_writelane_b32 v21, s53, 13 -; SI-NEXT: v_writelane_b32 v21, s54, 14 -; SI-NEXT: v_writelane_b32 v21, s55, 15 -; SI-NEXT: v_writelane_b32 v21, s64, 16 -; SI-NEXT: v_writelane_b32 v21, s65, 17 -; SI-NEXT: v_writelane_b32 v21, s66, 18 -; SI-NEXT: v_writelane_b32 v21, s67, 19 -; SI-NEXT: v_writelane_b32 v21, s68, 20 -; SI-NEXT: v_writelane_b32 v21, s69, 21 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_writelane_b32 v21, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s48, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_writelane_b32 v21, s71, 23 -; SI-NEXT: v_readfirstlane_b32 s49, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_writelane_b32 v21, s80, 24 -; SI-NEXT: v_readfirstlane_b32 s50, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_writelane_b32 v21, s81, 25 -; SI-NEXT: v_readfirstlane_b32 s51, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_writelane_b32 v21, s82, 26 -; SI-NEXT: v_readfirstlane_b32 s52, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_writelane_b32 v21, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s53, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_writelane_b32 v21, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s54, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_writelane_b32 v21, s85, 29 -; SI-NEXT: v_readfirstlane_b32 s55, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_writelane_b32 v21, s86, 30 -; SI-NEXT: v_readfirstlane_b32 s64, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_writelane_b32 v21, s87, 31 -; SI-NEXT: v_readfirstlane_b32 s65, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_writelane_b32 v21, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s66, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_writelane_b32 v21, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s67, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 -; SI-NEXT: v_writelane_b32 v21, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s68, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v21, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s69, v20 -; SI-NEXT: v_readfirstlane_b32 s70, v1 -; SI-NEXT: v_readfirstlane_b32 s71, v2 -; SI-NEXT: v_readfirstlane_b32 s80, v3 -; SI-NEXT: v_readfirstlane_b32 s81, v4 -; SI-NEXT: v_readfirstlane_b32 s82, v5 -; SI-NEXT: v_readfirstlane_b32 s83, v6 -; SI-NEXT: v_readfirstlane_b32 s84, v7 -; SI-NEXT: v_readfirstlane_b32 s85, v8 -; SI-NEXT: v_readfirstlane_b32 s86, v9 -; SI-NEXT: v_readfirstlane_b32 s87, v10 -; SI-NEXT: v_readfirstlane_b32 s96, v11 -; SI-NEXT: v_readfirstlane_b32 s97, v12 -; SI-NEXT: v_readfirstlane_b32 s98, v13 -; SI-NEXT: v_readfirstlane_b32 s99, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_writelane_b32 v33, s30, 0 +; SI-NEXT: v_writelane_b32 v33, s31, 1 +; SI-NEXT: v_writelane_b32 v33, s34, 2 +; SI-NEXT: v_writelane_b32 v33, s35, 3 +; SI-NEXT: v_writelane_b32 v33, s36, 4 +; SI-NEXT: v_writelane_b32 v33, s37, 5 +; SI-NEXT: v_writelane_b32 v33, s38, 6 +; SI-NEXT: v_writelane_b32 v33, s39, 7 +; SI-NEXT: v_writelane_b32 v33, s48, 8 +; SI-NEXT: v_writelane_b32 v33, s49, 9 +; SI-NEXT: v_writelane_b32 v33, s50, 10 +; SI-NEXT: v_writelane_b32 v33, s51, 11 +; SI-NEXT: v_writelane_b32 v33, s52, 12 +; SI-NEXT: v_writelane_b32 v33, s53, 13 +; SI-NEXT: v_writelane_b32 v33, s54, 14 +; SI-NEXT: v_writelane_b32 v33, s55, 15 +; SI-NEXT: v_writelane_b32 v33, s64, 16 +; SI-NEXT: v_writelane_b32 v33, s65, 17 +; SI-NEXT: v_writelane_b32 v33, s66, 18 +; SI-NEXT: v_writelane_b32 v33, s67, 19 +; SI-NEXT: v_writelane_b32 v33, s68, 20 +; SI-NEXT: v_writelane_b32 v33, s69, 21 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_writelane_b32 v33, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s48, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v33, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s49, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_writelane_b32 v33, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s50, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v33, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s51, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v33, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s52, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v33, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s53, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v33, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s54, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v33, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s55, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_writelane_b32 v33, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s64, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v33, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s65, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_writelane_b32 v33, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s66, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v33, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s67, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_writelane_b32 v33, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s68, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_writelane_b32 v33, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s69, v19 +; SI-NEXT: v_readfirstlane_b32 s70, v0 +; SI-NEXT: v_readfirstlane_b32 s71, v1 +; SI-NEXT: v_readfirstlane_b32 s80, v2 +; SI-NEXT: v_readfirstlane_b32 s81, v3 +; SI-NEXT: v_readfirstlane_b32 s82, v4 +; SI-NEXT: v_readfirstlane_b32 s83, v5 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s85, v7 +; SI-NEXT: v_readfirstlane_b32 s86, v8 +; SI-NEXT: v_readfirstlane_b32 s87, v9 +; SI-NEXT: v_readfirstlane_b32 s96, v10 +; SI-NEXT: v_readfirstlane_b32 s97, v11 +; SI-NEXT: v_readfirstlane_b32 s98, v12 +; SI-NEXT: v_readfirstlane_b32 s99, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v18 -; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s9, v17 +; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB17_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s49, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s4, 1 +; SI-NEXT: v_writelane_b32 v34, s4, 1 ; SI-NEXT: s_lshl_b32 s4, s49, 16 -; SI-NEXT: v_writelane_b32 v22, s4, 0 +; SI-NEXT: v_writelane_b32 v34, s4, 0 ; SI-NEXT: s_and_b32 s4, s48, 0xffff0000 -; SI-NEXT: v_writelane_b32 v22, s4, 2 +; SI-NEXT: v_writelane_b32 v34, s4, 2 ; SI-NEXT: s_lshl_b32 s4, s48, 16 ; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s9, 16 @@ -23735,7 +23633,7 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s36, s51, 16 ; SI-NEXT: s_and_b32 s39, s50, 0xffff0000 ; SI-NEXT: s_lshl_b32 s38, s50, 16 -; SI-NEXT: v_writelane_b32 v22, s4, 3 +; SI-NEXT: v_writelane_b32 v34, s4, 3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB17_3 ; SI-NEXT: .LBB17_2: @@ -23840,14 +23738,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s45, s47 ; SI-NEXT: s_mov_b32 s46, s56 ; SI-NEXT: s_mov_b32 s47, s57 -; SI-NEXT: s_mov_b32 s56, s58 -; SI-NEXT: s_mov_b32 s57, s59 -; SI-NEXT: s_mov_b32 s58, s60 -; SI-NEXT: s_mov_b32 s59, s61 -; SI-NEXT: s_mov_b32 s60, s62 -; SI-NEXT: s_mov_b32 s61, s63 -; SI-NEXT: v_readlane_b32 s62, v22, 0 -; SI-NEXT: v_readlane_b32 s63, v22, 1 +; SI-NEXT: v_readlane_b32 s56, v34, 0 +; SI-NEXT: v_readlane_b32 s57, v34, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s48, s48, 3 @@ -23913,12 +23805,12 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s44, s82, 16 ; SI-NEXT: s_and_b32 s47, s81, 0xffff0000 ; SI-NEXT: s_lshl_b32 s46, s81, 16 -; SI-NEXT: s_and_b32 s57, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s80, 16 -; SI-NEXT: s_and_b32 s59, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s71, 16 -; SI-NEXT: s_and_b32 s61, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s70, 16 +; SI-NEXT: s_and_b32 s59, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s80, 16 +; SI-NEXT: s_and_b32 s61, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s71, 16 +; SI-NEXT: s_and_b32 s63, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s70, 16 ; SI-NEXT: s_and_b32 s73, s69, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s69, 16 ; SI-NEXT: s_and_b32 s75, s68, 0xffff0000 @@ -23943,278 +23835,185 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s36, s51, 16 ; SI-NEXT: s_and_b32 s39, s50, 0xffff0000 ; SI-NEXT: s_lshl_b32 s38, s50, 16 -; SI-NEXT: s_and_b32 s63, s49, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s49, 16 +; SI-NEXT: s_and_b32 s57, s49, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s49, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s6, 2 +; SI-NEXT: v_writelane_b32 v34, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s48, 16 -; SI-NEXT: v_writelane_b32 v22, s6, 3 +; SI-NEXT: v_writelane_b32 v34, s6, 3 ; SI-NEXT: .LBB17_5: ; %end -; SI-NEXT: v_readlane_b32 s6, v22, 2 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_readlane_b32 s6, v22, 3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s99, v21, 35 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_readlane_b32 s6, v34, 2 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s98, v21, 34 -; SI-NEXT: v_readlane_b32 s97, v21, 33 -; SI-NEXT: v_readlane_b32 s96, v21, 32 -; SI-NEXT: v_readlane_b32 s87, v21, 31 -; SI-NEXT: v_readlane_b32 s86, v21, 30 -; SI-NEXT: v_readlane_b32 s85, v21, 29 -; SI-NEXT: v_readlane_b32 s84, v21, 28 -; SI-NEXT: v_readlane_b32 s83, v21, 27 -; SI-NEXT: v_readlane_b32 s82, v21, 26 -; SI-NEXT: v_readlane_b32 s81, v21, 25 -; SI-NEXT: v_readlane_b32 s80, v21, 24 -; SI-NEXT: v_readlane_b32 s71, v21, 23 -; SI-NEXT: v_readlane_b32 s70, v21, 22 -; SI-NEXT: v_readlane_b32 s69, v21, 21 -; SI-NEXT: v_readlane_b32 s68, v21, 20 -; SI-NEXT: v_readlane_b32 s67, v21, 19 -; SI-NEXT: v_readlane_b32 s66, v21, 18 -; SI-NEXT: v_readlane_b32 s65, v21, 17 -; SI-NEXT: v_readlane_b32 s64, v21, 16 -; SI-NEXT: v_readlane_b32 s55, v21, 15 -; SI-NEXT: v_readlane_b32 s54, v21, 14 -; SI-NEXT: v_readlane_b32 s53, v21, 13 -; SI-NEXT: v_readlane_b32 s52, v21, 12 -; SI-NEXT: v_readlane_b32 s51, v21, 11 -; SI-NEXT: v_readlane_b32 s50, v21, 10 -; SI-NEXT: v_readlane_b32 s49, v21, 9 -; SI-NEXT: v_readlane_b32 s48, v21, 8 -; SI-NEXT: v_readlane_b32 s39, v21, 7 -; SI-NEXT: v_readlane_b32 s38, v21, 6 -; SI-NEXT: v_readlane_b32 s37, v21, 5 -; SI-NEXT: v_readlane_b32 s36, v21, 4 -; SI-NEXT: v_readlane_b32 s35, v21, 3 -; SI-NEXT: v_readlane_b32 s34, v21, 2 -; SI-NEXT: v_readlane_b32 s31, v21, 1 -; SI-NEXT: v_readlane_b32 s30, v21, 0 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s23 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_readlane_b32 s99, v33, 35 +; SI-NEXT: v_readlane_b32 s98, v33, 34 +; SI-NEXT: v_readlane_b32 s97, v33, 33 +; SI-NEXT: v_readlane_b32 s96, v33, 32 +; SI-NEXT: v_readlane_b32 s87, v33, 31 +; SI-NEXT: v_readlane_b32 s86, v33, 30 +; SI-NEXT: v_readlane_b32 s85, v33, 29 +; SI-NEXT: v_readlane_b32 s84, v33, 28 +; SI-NEXT: v_readlane_b32 s83, v33, 27 +; SI-NEXT: v_readlane_b32 s82, v33, 26 +; SI-NEXT: v_readlane_b32 s81, v33, 25 +; SI-NEXT: v_readlane_b32 s80, v33, 24 +; SI-NEXT: v_readlane_b32 s71, v33, 23 +; SI-NEXT: v_readlane_b32 s70, v33, 22 +; SI-NEXT: v_readlane_b32 s69, v33, 21 +; SI-NEXT: v_readlane_b32 s68, v33, 20 +; SI-NEXT: v_readlane_b32 s67, v33, 19 +; SI-NEXT: v_readlane_b32 s66, v33, 18 +; SI-NEXT: v_readlane_b32 s65, v33, 17 +; SI-NEXT: v_readlane_b32 s64, v33, 16 +; SI-NEXT: v_readlane_b32 s55, v33, 15 +; SI-NEXT: v_readlane_b32 s54, v33, 14 +; SI-NEXT: v_readlane_b32 s53, v33, 13 +; SI-NEXT: v_readlane_b32 s52, v33, 12 +; SI-NEXT: v_readlane_b32 s51, v33, 11 +; SI-NEXT: v_readlane_b32 s50, v33, 10 +; SI-NEXT: v_readlane_b32 s49, v33, 9 +; SI-NEXT: v_readlane_b32 s48, v33, 8 +; SI-NEXT: v_readlane_b32 s39, v33, 7 +; SI-NEXT: v_readlane_b32 s38, v33, 6 +; SI-NEXT: v_readlane_b32 s37, v33, 5 +; SI-NEXT: v_readlane_b32 s36, v33, 4 +; SI-NEXT: v_readlane_b32 s35, v33, 3 +; SI-NEXT: v_readlane_b32 s34, v33, 2 +; SI-NEXT: v_readlane_b32 s31, v33, 1 +; SI-NEXT: v_readlane_b32 s30, v33, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v64bf16_scalar: @@ -24461,382 +24260,260 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v32i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24882,282 +24559,422 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v19, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v57, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v46, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v44, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v42, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v40, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v54, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v52, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v50, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v48, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v35, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_alignbit_b32 v18, v18, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v32i32: @@ -27435,667 +27252,665 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v33 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[0:1], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshr_b64 v[2:3], v[58:59], 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshr_b64 v[3:4], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v4, v41 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[41:42], 16 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_mov_b32_e32 v61, v37 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v39 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v6, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v8, v46 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[8:9], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v10, v44 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[44:45], 16 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v13, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[18:19], 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshr_b64 v[18:19], v[34:35], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v20, v51 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[51:52], 16 ; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_mov_b32_e32 v22, v49 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[49:50], 16 ; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v38 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v59 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[31:32], v[53:54], 16 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v63, v39 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -28103,9 +27918,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -28114,9 +27930,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -28125,9 +27942,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -28136,9 +27953,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -28147,12 +27964,20 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -28161,45 +27986,45 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB19_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar: @@ -30903,33 +30728,32 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -30942,16 +30766,16 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -31021,17 +30845,13 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 @@ -31041,114 +30861,97 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -31162,26 +30965,29 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31198,89 +31004,87 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 @@ -31288,37 +31092,42 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -31336,14 +31145,20 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -31354,343 +31169,248 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v50, v28 -; SI-NEXT: v_mov_b32_e32 v48, v29 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v56, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v36, v30 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v31, v29 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v64f16: @@ -31866,61 +31586,61 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_readfirstlane_b32 s41, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_readfirstlane_b32 s43, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_readfirstlane_b32 s45, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s26, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s28, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s29, v20 -; SI-NEXT: v_readfirstlane_b32 s23, v1 -; SI-NEXT: v_readfirstlane_b32 s22, v2 -; SI-NEXT: v_readfirstlane_b32 s21, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v4 -; SI-NEXT: v_readfirstlane_b32 s19, v5 -; SI-NEXT: v_readfirstlane_b32 s18, v6 -; SI-NEXT: v_readfirstlane_b32 s17, v7 -; SI-NEXT: v_readfirstlane_b32 s16, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s23, v0 +; SI-NEXT: v_readfirstlane_b32 s22, v1 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s20, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s9, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -31939,116 +31659,110 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 ; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 @@ -32078,6 +31792,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_lshr_b32 s5, s41, 16 ; SI-NEXT: s_lshr_b32 s56, s42, 16 @@ -32108,302 +31823,173 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 vcc_hi, s10, 16 ; SI-NEXT: s_lshr_b32 s30, s8, 16 ; SI-NEXT: s_lshr_b32 s31, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: .LBB21_3: ; %end ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v10, v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v12, v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_or_b32_e32 v16, v30, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 +; SI-NEXT: v_or_b32_e32 v24, v30, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32419,78 +32005,120 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v26, v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v17, v54, v17 +; SI-NEXT: v_or_b32_e32 v19, v52, v19 +; SI-NEXT: v_or_b32_e32 v21, v50, v21 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v29, v34, v29 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v32i32_to_v64f16_scalar: @@ -32737,755 +32365,789 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v32i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v58, v2 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_or_b32_e32 v25, v51, v25 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v52, v27 -; SI-NEXT: v_or_b32_e32 v28, v39, v28 -; SI-NEXT: v_or_b32_e32 v29, v37, v29 -; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v18, v59, v18 +; SI-NEXT: v_or_b32_e32 v19, v57, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v45, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: v_or_b32_e32 v25, v53, v25 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v27, v49, v27 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v36, v29 +; SI-NEXT: v_or_b32_e32 v30, v34, v30 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v61, v17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB22_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 @@ -33494,23 +33156,23 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB22_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32i32: @@ -33751,548 +33413,676 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB23_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_or_b32_e32 v22, v51, v22 -; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v50, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v49, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v26, v39, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v27, v38, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v28, v37, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v55, v19 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v53, v21 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v22, v52, v22 +; SI-NEXT: v_or_b32_e32 v23, v51, v23 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v49, v25 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v39, v27 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v63, v29 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_or_b32_e32 v30, v60, v30 +; SI-NEXT: v_or_b32_e32 v31, v57, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB23_3 ; SI-NEXT: .LBB23_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v36, v34 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v59 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB23_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v51, v43 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB23_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -34300,55 +34090,63 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -34356,46 +34154,43 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB23_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34723,390 +34518,292 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB24_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB24_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v64i16: @@ -35282,79 +34979,79 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s30, 0 -; SI-NEXT: v_writelane_b32 v21, s31, 1 -; SI-NEXT: v_writelane_b32 v21, s34, 2 -; SI-NEXT: v_writelane_b32 v21, s35, 3 -; SI-NEXT: v_writelane_b32 v21, s36, 4 -; SI-NEXT: v_writelane_b32 v21, s37, 5 -; SI-NEXT: v_writelane_b32 v21, s38, 6 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_writelane_b32 v21, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s56, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_writelane_b32 v21, s48, 8 -; SI-NEXT: v_readfirstlane_b32 s57, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_writelane_b32 v21, s49, 9 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_writelane_b32 v21, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s47, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_writelane_b32 v21, s51, 11 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_writelane_b32 v21, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s45, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_writelane_b32 v21, s53, 13 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_writelane_b32 v21, s54, 14 -; SI-NEXT: v_readfirstlane_b32 s43, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_writelane_b32 v21, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_writelane_b32 v21, s64, 16 -; SI-NEXT: v_readfirstlane_b32 s41, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_writelane_b32 v21, s65, 17 -; SI-NEXT: v_readfirstlane_b32 s24, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_writelane_b32 v21, s66, 18 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 -; SI-NEXT: v_writelane_b32 v21, s67, 19 -; SI-NEXT: v_readfirstlane_b32 s22, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v21, s68, 20 -; SI-NEXT: v_readfirstlane_b32 s23, v20 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_readfirstlane_b32 s19, v4 -; SI-NEXT: v_readfirstlane_b32 s16, v5 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v16 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 16 @@ -35375,8 +35072,8 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s69, s57, 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 @@ -35425,8 +35122,8 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 @@ -35459,247 +35156,157 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s36, 16 ; SI-NEXT: s_and_b32 s29, s56, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s57, 0xffff -; SI-NEXT: s_lshl_b32 s29, s69, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s34, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s68, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s67, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s42, 0xffff -; SI-NEXT: s_lshl_b32 s29, s94, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s66, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s40, 0xffff -; SI-NEXT: s_lshl_b32 s29, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s65, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s64, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s55, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s54, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s53, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s51, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s56, s60, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s69, v21, 21 -; SI-NEXT: v_readlane_b32 s68, v21, 20 -; SI-NEXT: v_readlane_b32 s67, v21, 19 -; SI-NEXT: v_readlane_b32 s66, v21, 18 -; SI-NEXT: v_readlane_b32 s65, v21, 17 -; SI-NEXT: v_readlane_b32 s64, v21, 16 -; SI-NEXT: v_readlane_b32 s55, v21, 15 -; SI-NEXT: v_readlane_b32 s54, v21, 14 -; SI-NEXT: v_readlane_b32 s53, v21, 13 -; SI-NEXT: v_readlane_b32 s52, v21, 12 -; SI-NEXT: v_readlane_b32 s51, v21, 11 -; SI-NEXT: v_readlane_b32 s50, v21, 10 -; SI-NEXT: v_readlane_b32 s49, v21, 9 -; SI-NEXT: v_readlane_b32 s48, v21, 8 -; SI-NEXT: v_readlane_b32 s39, v21, 7 -; SI-NEXT: v_readlane_b32 s38, v21, 6 -; SI-NEXT: v_readlane_b32 s37, v21, 5 -; SI-NEXT: v_readlane_b32 s36, v21, 4 -; SI-NEXT: v_readlane_b32 s35, v21, 3 -; SI-NEXT: v_readlane_b32 s34, v21, 2 -; SI-NEXT: v_readlane_b32 s31, v21, 1 -; SI-NEXT: v_readlane_b32 s30, v21, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: ; SI-NEXT: ; implicit-def: $sgpr36 @@ -35726,12 +35333,12 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB25_2 @@ -35980,186 +35587,319 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v32i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v53 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -36189,362 +35929,213 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v16, v16, v53 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v19, v19, v39 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB26_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v40, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v0, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v62 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_or_b32_e32 v4, v4, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v58 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_or_b32_e32 v15, v51, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: .LBB26_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB26_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v16, v53, v16 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 -; SI-NEXT: v_or_b32_e32 v5, v58, v5 -; SI-NEXT: v_or_b32_e32 v6, v57, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 -; SI-NEXT: v_or_b32_e32 v19, v39, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v35, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v38, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB26_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36785,442 +36376,387 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB27_2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v16, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v44 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB27_3 -; SI-NEXT: .LBB27_2: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB27_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB27_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB27_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -59930,27 +59466,86 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -59981,193 +59576,130 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -60199,85 +59731,82 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v32, 1.0, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v31, 1.0, v63 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v62 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -60294,369 +59823,274 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64bf16: @@ -60816,8 +60250,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -60842,68 +60276,68 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v19, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s6, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_readfirstlane_b32 s6, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_readfirstlane_b32 s7, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_readfirstlane_b32 s7, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_readfirstlane_b32 s10, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_readfirstlane_b32 s12, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_readfirstlane_b32 s12, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s8, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s9, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_readfirstlane_b32 s9, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_readfirstlane_b32 s11, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_readfirstlane_b32 s11, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_readfirstlane_b32 s13, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_readfirstlane_b32 s13, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_readfirstlane_b32 s15, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_readfirstlane_b32 s15, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s16, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_readfirstlane_b32 s16, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s17, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s18, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s19, v20 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s22, v3 -; SI-NEXT: v_readfirstlane_b32 s23, v4 -; SI-NEXT: v_readfirstlane_b32 s24, v5 -; SI-NEXT: v_readfirstlane_b32 s25, v6 -; SI-NEXT: v_readfirstlane_b32 s26, v7 -; SI-NEXT: v_readfirstlane_b32 s27, v8 -; SI-NEXT: v_readfirstlane_b32 s28, v9 -; SI-NEXT: v_readfirstlane_b32 s29, v10 -; SI-NEXT: v_readfirstlane_b32 s40, v11 -; SI-NEXT: v_readfirstlane_b32 s41, v12 -; SI-NEXT: v_readfirstlane_b32 s42, v13 -; SI-NEXT: v_readfirstlane_b32 s43, v14 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v16 -; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s22, v2 +; SI-NEXT: v_readfirstlane_b32 s23, v3 +; SI-NEXT: v_readfirstlane_b32 s24, v4 +; SI-NEXT: v_readfirstlane_b32 s25, v5 +; SI-NEXT: v_readfirstlane_b32 s26, v6 +; SI-NEXT: v_readfirstlane_b32 s27, v7 +; SI-NEXT: v_readfirstlane_b32 s28, v8 +; SI-NEXT: v_readfirstlane_b32 s29, v9 +; SI-NEXT: v_readfirstlane_b32 s40, v10 +; SI-NEXT: v_readfirstlane_b32 s41, v11 +; SI-NEXT: v_readfirstlane_b32 s42, v12 +; SI-NEXT: v_readfirstlane_b32 s43, v13 +; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: v_readfirstlane_b32 s45, v15 +; SI-NEXT: v_readfirstlane_b32 s46, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s47, v18 +; SI-NEXT: v_readfirstlane_b32 s47, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -60991,109 +60425,116 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_lshl_b32 s59, s6, 16 ; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s47, 1.0 -; SI-NEXT: v_add_f32_e64 v1, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s46, 1.0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v43, s9, 1.0 -; SI-NEXT: v_add_f32_e64 v41, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v55, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v53, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v51, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v39, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s42, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s45, 1.0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s6, 1.0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v29, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s46, 1.0 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_f32_e64 v27, s45, 1.0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e64 v0, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s44, 1.0 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e64 v2, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s43, 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v45 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_add_f32_e64 v0, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s29, 1.0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v24 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_add_f32_e64 v0, s6, 1.0 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v17 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_branch .LBB41_5 ; SI-NEXT: .LBB41_3: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -61166,291 +60607,191 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: s_branch .LBB41_2 ; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v7, s63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, s61 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, s60 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v2, s59 -; SI-NEXT: v_mov_b32_e32 v3, s58 -; SI-NEXT: v_mov_b32_e32 v61, s57 -; SI-NEXT: v_mov_b32_e32 v1, s56 -; SI-NEXT: v_mov_b32_e32 v59, s99 -; SI-NEXT: v_mov_b32_e32 v60, s98 -; SI-NEXT: v_mov_b32_e32 v57, s97 -; SI-NEXT: v_mov_b32_e32 v58, s96 -; SI-NEXT: v_mov_b32_e32 v47, s87 -; SI-NEXT: v_mov_b32_e32 v56, s86 -; SI-NEXT: v_mov_b32_e32 v45, s85 -; SI-NEXT: v_mov_b32_e32 v46, s84 -; SI-NEXT: v_mov_b32_e32 v43, s83 -; SI-NEXT: v_mov_b32_e32 v44, s82 -; SI-NEXT: v_mov_b32_e32 v41, s81 -; SI-NEXT: v_mov_b32_e32 v42, s80 -; SI-NEXT: v_mov_b32_e32 v55, s71 -; SI-NEXT: v_mov_b32_e32 v40, s70 -; SI-NEXT: v_mov_b32_e32 v53, s69 -; SI-NEXT: v_mov_b32_e32 v54, s68 -; SI-NEXT: v_mov_b32_e32 v51, s67 -; SI-NEXT: v_mov_b32_e32 v52, s66 -; SI-NEXT: v_mov_b32_e32 v49, s65 -; SI-NEXT: v_mov_b32_e32 v50, s64 -; SI-NEXT: v_mov_b32_e32 v39, s55 -; SI-NEXT: v_mov_b32_e32 v48, s54 -; SI-NEXT: v_mov_b32_e32 v37, s53 -; SI-NEXT: v_mov_b32_e32 v38, s52 -; SI-NEXT: v_mov_b32_e32 v35, s51 -; SI-NEXT: v_mov_b32_e32 v36, s50 -; SI-NEXT: v_mov_b32_e32 v33, s49 -; SI-NEXT: v_mov_b32_e32 v34, s48 -; SI-NEXT: v_mov_b32_e32 v31, s39 -; SI-NEXT: v_mov_b32_e32 v32, s38 -; SI-NEXT: v_mov_b32_e32 v29, s37 -; SI-NEXT: v_mov_b32_e32 v30, s36 -; SI-NEXT: v_mov_b32_e32 v27, s35 -; SI-NEXT: v_mov_b32_e32 v28, s34 -; SI-NEXT: v_mov_b32_e32 v25, s31 -; SI-NEXT: v_mov_b32_e32 v26, s30 -; SI-NEXT: v_mov_b32_e32 v23, s95 -; SI-NEXT: v_mov_b32_e32 v24, s94 -; SI-NEXT: v_mov_b32_e32 v21, s93 -; SI-NEXT: v_mov_b32_e32 v22, s92 -; SI-NEXT: v_mov_b32_e32 v19, s91 -; SI-NEXT: v_mov_b32_e32 v20, s90 -; SI-NEXT: v_mov_b32_e32 v17, s89 -; SI-NEXT: v_mov_b32_e32 v18, s88 -; SI-NEXT: v_mov_b32_e32 v15, s79 -; SI-NEXT: v_mov_b32_e32 v16, s78 -; SI-NEXT: v_mov_b32_e32 v13, s77 -; SI-NEXT: v_mov_b32_e32 v14, s76 -; SI-NEXT: v_mov_b32_e32 v11, s75 -; SI-NEXT: v_mov_b32_e32 v12, s74 -; SI-NEXT: v_mov_b32_e32 v9, s73 -; SI-NEXT: v_mov_b32_e32 v10, s72 -; SI-NEXT: v_mov_b32_e32 v7, s63 -; SI-NEXT: v_mov_b32_e32 v8, s62 -; SI-NEXT: v_mov_b32_e32 v5, s61 -; SI-NEXT: v_mov_b32_e32 v6, s60 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v0, s59 +; SI-NEXT: v_mov_b32_e32 v1, s58 +; SI-NEXT: v_mov_b32_e32 v2, s57 +; SI-NEXT: v_mov_b32_e32 v3, s56 +; SI-NEXT: v_mov_b32_e32 v4, s99 +; SI-NEXT: v_mov_b32_e32 v5, s98 +; SI-NEXT: v_mov_b32_e32 v6, s97 +; SI-NEXT: v_mov_b32_e32 v39, s96 +; SI-NEXT: v_mov_b32_e32 v38, s87 +; SI-NEXT: v_mov_b32_e32 v9, s86 +; SI-NEXT: v_mov_b32_e32 v10, s85 +; SI-NEXT: v_mov_b32_e32 v11, s84 +; SI-NEXT: v_mov_b32_e32 v12, s83 +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: v_mov_b32_e32 v14, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v48, s71 +; SI-NEXT: v_mov_b32_e32 v17, s70 +; SI-NEXT: v_mov_b32_e32 v18, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 +; SI-NEXT: v_mov_b32_e32 v20, s67 +; SI-NEXT: v_mov_b32_e32 v21, s66 +; SI-NEXT: v_mov_b32_e32 v22, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v24, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 +; SI-NEXT: v_mov_b32_e32 v26, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v28, s51 +; SI-NEXT: v_mov_b32_e32 v29, s50 +; SI-NEXT: v_mov_b32_e32 v30, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v32, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v37, s36 +; SI-NEXT: v_mov_b32_e32 v60, s35 +; SI-NEXT: v_mov_b32_e32 v61, s34 +; SI-NEXT: v_mov_b32_e32 v58, s31 +; SI-NEXT: v_mov_b32_e32 v59, s30 +; SI-NEXT: v_mov_b32_e32 v56, s95 +; SI-NEXT: v_mov_b32_e32 v57, s94 +; SI-NEXT: v_mov_b32_e32 v46, s93 +; SI-NEXT: v_mov_b32_e32 v47, s92 +; SI-NEXT: v_mov_b32_e32 v44, s91 +; SI-NEXT: v_mov_b32_e32 v45, s90 +; SI-NEXT: v_mov_b32_e32 v42, s89 +; SI-NEXT: v_mov_b32_e32 v43, s88 +; SI-NEXT: v_mov_b32_e32 v40, s79 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v54, s77 +; SI-NEXT: v_mov_b32_e32 v55, s76 +; SI-NEXT: v_mov_b32_e32 v52, s75 +; SI-NEXT: v_mov_b32_e32 v53, s74 +; SI-NEXT: v_mov_b32_e32 v50, s73 +; SI-NEXT: v_mov_b32_e32 v51, s72 +; SI-NEXT: v_mov_b32_e32 v49, s62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: .LBB41_5: ; %end -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -61487,24 +60828,45 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[5:6], 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[60:61], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[58:59], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[40:41], 16 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -61519,11 +60881,34 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[38:39], 16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[30:31], v[36:37], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64bf16_scalar: @@ -61754,382 +61139,254 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -62175,282 +61432,428 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v19, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v57, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v46, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v44, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v42, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v40, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v54, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v52, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v50, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v48, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v35, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_alignbit_b32 v18, v18, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v32f32: @@ -64728,667 +64131,665 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v33 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[0:1], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshr_b64 v[2:3], v[58:59], 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshr_b64 v[3:4], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v4, v41 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[41:42], 16 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_mov_b32_e32 v61, v37 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v39 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v6, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v8, v46 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[8:9], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v10, v44 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[44:45], 16 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v13, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[18:19], 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshr_b64 v[18:19], v[34:35], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v20, v51 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[51:52], 16 ; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_mov_b32_e32 v22, v49 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[49:50], 16 ; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v38 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v59 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[31:32], v[53:54], 16 ; SI-NEXT: s_branch .LBB43_3 ; SI-NEXT: .LBB43_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v63, v39 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB43_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB43_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -65396,9 +64797,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -65407,9 +64809,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -65418,9 +64821,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -65429,9 +64832,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -65440,12 +64843,20 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -65454,45 +64865,45 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v32f32_scalar: @@ -68196,33 +67607,32 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -68235,16 +67645,16 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -68314,17 +67724,13 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 @@ -68334,114 +67740,97 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -68455,26 +67844,29 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -68491,89 +67883,87 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_f32_e32 v44, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v46, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -68581,37 +67971,42 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -68629,14 +68024,20 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -68647,343 +68048,248 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v50, v28 -; SI-NEXT: v_mov_b32_e32 v48, v29 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v56, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v36, v30 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v31, v29 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64f16: @@ -69142,7 +68448,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-LABEL: bitcast_v32f32_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -69159,306 +68465,300 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v42, s16 +; SI-NEXT: v_mov_b32_e32 v48, s16 +; SI-NEXT: v_mov_b32_e32 v40, s17 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v43, s17 ; SI-NEXT: v_mov_b32_e32 v41, s18 -; SI-NEXT: v_mov_b32_e32 v40, s19 -; SI-NEXT: v_mov_b32_e32 v55, s20 -; SI-NEXT: v_mov_b32_e32 v54, s21 -; SI-NEXT: v_mov_b32_e32 v53, s22 -; SI-NEXT: v_mov_b32_e32 v52, s23 -; SI-NEXT: v_mov_b32_e32 v51, s24 -; SI-NEXT: v_mov_b32_e32 v50, s25 -; SI-NEXT: v_mov_b32_e32 v49, s26 -; SI-NEXT: v_mov_b32_e32 v39, s27 -; SI-NEXT: v_mov_b32_e32 v38, s28 +; SI-NEXT: v_mov_b32_e32 v55, s19 +; SI-NEXT: v_mov_b32_e32 v54, s20 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s22 +; SI-NEXT: v_mov_b32_e32 v51, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v49, s25 +; SI-NEXT: v_mov_b32_e32 v39, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 ; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v39 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v43 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v42 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v42 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v40 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v27, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v37 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v29, 1.0, v55 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v38 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -69471,343 +68771,236 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v56 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -69824,106 +69017,112 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; kill: killed $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; kill: killed $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -70154,755 +69353,789 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v58, v2 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_or_b32_e32 v25, v51, v25 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v52, v27 -; SI-NEXT: v_or_b32_e32 v28, v39, v28 -; SI-NEXT: v_or_b32_e32 v29, v37, v29 -; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v18, v59, v18 +; SI-NEXT: v_or_b32_e32 v19, v57, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v45, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: v_or_b32_e32 v25, v53, v25 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v27, v49, v27 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v36, v29 +; SI-NEXT: v_or_b32_e32 v30, v34, v30 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v61, v17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 @@ -70911,23 +70144,23 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32f32: @@ -71168,548 +70401,676 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-LABEL: bitcast_v64f16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB47_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_or_b32_e32 v22, v51, v22 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v50, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v49, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v26, v39, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v27, v38, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v28, v37, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v55, v19 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v53, v21 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v22, v52, v22 +; SI-NEXT: v_or_b32_e32 v23, v51, v23 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v49, v25 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v39, v27 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v63, v29 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_or_b32_e32 v30, v60, v30 +; SI-NEXT: v_or_b32_e32 v31, v57, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB47_3 ; SI-NEXT: .LBB47_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v36, v34 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v59 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB47_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v51, v43 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -71717,55 +71078,63 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -71773,46 +71142,43 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB47_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -72140,390 +71506,292 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64i16: @@ -72682,22 +71950,22 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-LABEL: bitcast_v32f32_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -72716,326 +71984,257 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_mov_b32_e32 v55, v48 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v50 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v35 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 +; SI-NEXT: v_mov_b32_e32 v10, v54 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v13, v41 +; SI-NEXT: v_mov_b32_e32 v14, v32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -73052,47 +72251,55 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v17, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v32f32_to_v64i16_scalar: @@ -73323,186 +72530,319 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v53 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -73532,255 +72872,53 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v16, v16, v53 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v19, v19, v39 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v0, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v62 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_or_b32_e32 v4, v4, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v58 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v16, v53, v16 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -73789,32 +72927,21 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 -; SI-NEXT: v_or_b32_e32 v5, v58, v5 -; SI-NEXT: v_or_b32_e32 v6, v57, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 -; SI-NEXT: v_or_b32_e32 v19, v39, v19 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v40, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -73822,72 +72949,136 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v51, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v35, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v38, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -74128,442 +73319,387 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB51_2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v16, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v44 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB51_3 -; SI-NEXT: .LBB51_2: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB51_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB51_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB51_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -95552,27 +94688,86 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -95603,193 +94798,130 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -95821,465 +94953,366 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; SI-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v62, vcc +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64bf16: @@ -96463,106 +95496,106 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v21, s30, 0 -; SI-NEXT: v_writelane_b32 v21, s31, 1 -; SI-NEXT: v_writelane_b32 v21, s34, 2 -; SI-NEXT: v_writelane_b32 v21, s35, 3 -; SI-NEXT: v_writelane_b32 v21, s36, 4 -; SI-NEXT: v_writelane_b32 v21, s37, 5 -; SI-NEXT: v_writelane_b32 v21, s38, 6 -; SI-NEXT: v_writelane_b32 v21, s39, 7 -; SI-NEXT: v_writelane_b32 v21, s48, 8 -; SI-NEXT: v_writelane_b32 v21, s49, 9 -; SI-NEXT: v_writelane_b32 v21, s50, 10 -; SI-NEXT: v_writelane_b32 v21, s51, 11 -; SI-NEXT: v_writelane_b32 v21, s52, 12 -; SI-NEXT: v_writelane_b32 v21, s53, 13 -; SI-NEXT: v_writelane_b32 v21, s54, 14 -; SI-NEXT: v_writelane_b32 v21, s55, 15 -; SI-NEXT: v_writelane_b32 v21, s64, 16 -; SI-NEXT: v_writelane_b32 v21, s65, 17 -; SI-NEXT: v_writelane_b32 v21, s66, 18 -; SI-NEXT: v_writelane_b32 v21, s67, 19 -; SI-NEXT: v_writelane_b32 v21, s68, 20 -; SI-NEXT: v_writelane_b32 v21, s69, 21 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_writelane_b32 v21, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s48, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_writelane_b32 v21, s71, 23 -; SI-NEXT: v_readfirstlane_b32 s49, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_writelane_b32 v21, s80, 24 -; SI-NEXT: v_readfirstlane_b32 s50, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_writelane_b32 v21, s81, 25 -; SI-NEXT: v_readfirstlane_b32 s51, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_writelane_b32 v21, s82, 26 -; SI-NEXT: v_readfirstlane_b32 s52, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_writelane_b32 v21, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s53, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_writelane_b32 v21, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s54, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_writelane_b32 v21, s85, 29 -; SI-NEXT: v_readfirstlane_b32 s55, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_writelane_b32 v21, s86, 30 -; SI-NEXT: v_readfirstlane_b32 s64, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_writelane_b32 v21, s87, 31 -; SI-NEXT: v_readfirstlane_b32 s65, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_writelane_b32 v21, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s66, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_writelane_b32 v21, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s67, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 -; SI-NEXT: v_writelane_b32 v21, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s68, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v21, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s69, v20 -; SI-NEXT: v_readfirstlane_b32 s70, v1 -; SI-NEXT: v_readfirstlane_b32 s71, v2 -; SI-NEXT: v_readfirstlane_b32 s80, v3 -; SI-NEXT: v_readfirstlane_b32 s81, v4 -; SI-NEXT: v_readfirstlane_b32 s82, v5 -; SI-NEXT: v_readfirstlane_b32 s83, v6 -; SI-NEXT: v_readfirstlane_b32 s84, v7 -; SI-NEXT: v_readfirstlane_b32 s85, v8 -; SI-NEXT: v_readfirstlane_b32 s86, v9 -; SI-NEXT: v_readfirstlane_b32 s87, v10 -; SI-NEXT: v_readfirstlane_b32 s96, v11 -; SI-NEXT: v_readfirstlane_b32 s97, v12 -; SI-NEXT: v_readfirstlane_b32 s98, v13 -; SI-NEXT: v_readfirstlane_b32 s99, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_writelane_b32 v33, s30, 0 +; SI-NEXT: v_writelane_b32 v33, s31, 1 +; SI-NEXT: v_writelane_b32 v33, s34, 2 +; SI-NEXT: v_writelane_b32 v33, s35, 3 +; SI-NEXT: v_writelane_b32 v33, s36, 4 +; SI-NEXT: v_writelane_b32 v33, s37, 5 +; SI-NEXT: v_writelane_b32 v33, s38, 6 +; SI-NEXT: v_writelane_b32 v33, s39, 7 +; SI-NEXT: v_writelane_b32 v33, s48, 8 +; SI-NEXT: v_writelane_b32 v33, s49, 9 +; SI-NEXT: v_writelane_b32 v33, s50, 10 +; SI-NEXT: v_writelane_b32 v33, s51, 11 +; SI-NEXT: v_writelane_b32 v33, s52, 12 +; SI-NEXT: v_writelane_b32 v33, s53, 13 +; SI-NEXT: v_writelane_b32 v33, s54, 14 +; SI-NEXT: v_writelane_b32 v33, s55, 15 +; SI-NEXT: v_writelane_b32 v33, s64, 16 +; SI-NEXT: v_writelane_b32 v33, s65, 17 +; SI-NEXT: v_writelane_b32 v33, s66, 18 +; SI-NEXT: v_writelane_b32 v33, s67, 19 +; SI-NEXT: v_writelane_b32 v33, s68, 20 +; SI-NEXT: v_writelane_b32 v33, s69, 21 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_writelane_b32 v33, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s48, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v33, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s49, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_writelane_b32 v33, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s50, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v33, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s51, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v33, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s52, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v33, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s53, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v33, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s54, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v33, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s55, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_writelane_b32 v33, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s64, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v33, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s65, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_writelane_b32 v33, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s66, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v33, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s67, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_writelane_b32 v33, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s68, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_writelane_b32 v33, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s69, v19 +; SI-NEXT: v_readfirstlane_b32 s70, v0 +; SI-NEXT: v_readfirstlane_b32 s71, v1 +; SI-NEXT: v_readfirstlane_b32 s80, v2 +; SI-NEXT: v_readfirstlane_b32 s81, v3 +; SI-NEXT: v_readfirstlane_b32 s82, v4 +; SI-NEXT: v_readfirstlane_b32 s83, v5 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s85, v7 +; SI-NEXT: v_readfirstlane_b32 s86, v8 +; SI-NEXT: v_readfirstlane_b32 s87, v9 +; SI-NEXT: v_readfirstlane_b32 s96, v10 +; SI-NEXT: v_readfirstlane_b32 s97, v11 +; SI-NEXT: v_readfirstlane_b32 s98, v12 +; SI-NEXT: v_readfirstlane_b32 s99, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v18 -; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s9, v17 +; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s9, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s4, 0 +; SI-NEXT: v_writelane_b32 v34, s4, 0 ; SI-NEXT: s_lshl_b32 s4, s9, 16 -; SI-NEXT: v_writelane_b32 v22, s4, 1 +; SI-NEXT: v_writelane_b32 v34, s4, 1 ; SI-NEXT: s_and_b32 s4, s8, 0xffff0000 -; SI-NEXT: v_writelane_b32 v22, s4, 2 +; SI-NEXT: v_writelane_b32 v34, s4, 2 ; SI-NEXT: s_lshl_b32 s4, s8, 16 -; SI-NEXT: v_writelane_b32 v22, s4, 3 +; SI-NEXT: v_writelane_b32 v34, s4, 3 ; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s7, 16 ; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 @@ -96659,11 +95692,11 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_and_b32 s10, s9, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v22, s10, 0 +; SI-NEXT: v_writelane_b32 v34, s10, 0 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_writelane_b32 v22, s9, 1 +; SI-NEXT: v_writelane_b32 v34, s9, 1 ; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: v_writelane_b32 v22, s9, 2 +; SI-NEXT: v_writelane_b32 v34, s9, 2 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s10, s7, 16 @@ -96725,275 +95758,182 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_lshl_b32 s36, s5, 16 ; SI-NEXT: s_and_b32 s39, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s38, s4, 16 -; SI-NEXT: v_writelane_b32 v22, s8, 3 +; SI-NEXT: v_writelane_b32 v34, s8, 3 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v22, 2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v22, 3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_readlane_b32 s4, v22, 0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v22, 1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v21, 35 -; SI-NEXT: v_readlane_b32 s98, v21, 34 -; SI-NEXT: v_readlane_b32 s97, v21, 33 -; SI-NEXT: v_readlane_b32 s96, v21, 32 -; SI-NEXT: v_readlane_b32 s87, v21, 31 -; SI-NEXT: v_readlane_b32 s86, v21, 30 -; SI-NEXT: v_readlane_b32 s85, v21, 29 -; SI-NEXT: v_readlane_b32 s84, v21, 28 -; SI-NEXT: v_readlane_b32 s83, v21, 27 -; SI-NEXT: v_readlane_b32 s82, v21, 26 -; SI-NEXT: v_readlane_b32 s81, v21, 25 -; SI-NEXT: v_readlane_b32 s80, v21, 24 -; SI-NEXT: v_readlane_b32 s71, v21, 23 -; SI-NEXT: v_readlane_b32 s70, v21, 22 -; SI-NEXT: v_readlane_b32 s69, v21, 21 -; SI-NEXT: v_readlane_b32 s68, v21, 20 -; SI-NEXT: v_readlane_b32 s67, v21, 19 -; SI-NEXT: v_readlane_b32 s66, v21, 18 -; SI-NEXT: v_readlane_b32 s65, v21, 17 -; SI-NEXT: v_readlane_b32 s64, v21, 16 -; SI-NEXT: v_readlane_b32 s55, v21, 15 -; SI-NEXT: v_readlane_b32 s54, v21, 14 -; SI-NEXT: v_readlane_b32 s53, v21, 13 -; SI-NEXT: v_readlane_b32 s52, v21, 12 -; SI-NEXT: v_readlane_b32 s51, v21, 11 -; SI-NEXT: v_readlane_b32 s50, v21, 10 -; SI-NEXT: v_readlane_b32 s49, v21, 9 -; SI-NEXT: v_readlane_b32 s48, v21, 8 -; SI-NEXT: v_readlane_b32 s39, v21, 7 -; SI-NEXT: v_readlane_b32 s38, v21, 6 -; SI-NEXT: v_readlane_b32 s37, v21, 5 -; SI-NEXT: v_readlane_b32 s36, v21, 4 -; SI-NEXT: v_readlane_b32 s35, v21, 3 -; SI-NEXT: v_readlane_b32 s34, v21, 2 -; SI-NEXT: v_readlane_b32 s31, v21, 1 -; SI-NEXT: v_readlane_b32 s30, v21, 0 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s21 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s17 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_readlane_b32 s4, v34, 2 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v34, 3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_readlane_b32 s4, v34, 0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v34, 1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_readlane_b32 s99, v33, 35 +; SI-NEXT: v_readlane_b32 s98, v33, 34 +; SI-NEXT: v_readlane_b32 s97, v33, 33 +; SI-NEXT: v_readlane_b32 s96, v33, 32 +; SI-NEXT: v_readlane_b32 s87, v33, 31 +; SI-NEXT: v_readlane_b32 s86, v33, 30 +; SI-NEXT: v_readlane_b32 s85, v33, 29 +; SI-NEXT: v_readlane_b32 s84, v33, 28 +; SI-NEXT: v_readlane_b32 s83, v33, 27 +; SI-NEXT: v_readlane_b32 s82, v33, 26 +; SI-NEXT: v_readlane_b32 s81, v33, 25 +; SI-NEXT: v_readlane_b32 s80, v33, 24 +; SI-NEXT: v_readlane_b32 s71, v33, 23 +; SI-NEXT: v_readlane_b32 s70, v33, 22 +; SI-NEXT: v_readlane_b32 s69, v33, 21 +; SI-NEXT: v_readlane_b32 s68, v33, 20 +; SI-NEXT: v_readlane_b32 s67, v33, 19 +; SI-NEXT: v_readlane_b32 s66, v33, 18 +; SI-NEXT: v_readlane_b32 s65, v33, 17 +; SI-NEXT: v_readlane_b32 s64, v33, 16 +; SI-NEXT: v_readlane_b32 s55, v33, 15 +; SI-NEXT: v_readlane_b32 s54, v33, 14 +; SI-NEXT: v_readlane_b32 s53, v33, 13 +; SI-NEXT: v_readlane_b32 s52, v33, 12 +; SI-NEXT: v_readlane_b32 s51, v33, 11 +; SI-NEXT: v_readlane_b32 s50, v33, 10 +; SI-NEXT: v_readlane_b32 s49, v33, 9 +; SI-NEXT: v_readlane_b32 s48, v33, 8 +; SI-NEXT: v_readlane_b32 s39, v33, 7 +; SI-NEXT: v_readlane_b32 s38, v33, 6 +; SI-NEXT: v_readlane_b32 s37, v33, 5 +; SI-NEXT: v_readlane_b32 s36, v33, 4 +; SI-NEXT: v_readlane_b32 s35, v33, 3 +; SI-NEXT: v_readlane_b32 s34, v33, 2 +; SI-NEXT: v_readlane_b32 s31, v33, 1 +; SI-NEXT: v_readlane_b32 s30, v33, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -97318,382 +96258,254 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v16i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -97739,282 +96551,428 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v19, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v57, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v46, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v44, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v42, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v40, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v54, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v52, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v50, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v48, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v35, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_alignbit_b32 v18, v18, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v16i64: @@ -100292,667 +99250,665 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v33 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB63_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[0:1], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshr_b64 v[2:3], v[58:59], 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshr_b64 v[3:4], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v4, v41 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[41:42], 16 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_mov_b32_e32 v61, v37 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v39 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v6, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v8, v46 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[8:9], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v10, v44 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[44:45], 16 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v13, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[18:19], 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshr_b64 v[18:19], v[34:35], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v20, v51 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[51:52], 16 ; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_mov_b32_e32 v22, v49 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[49:50], 16 ; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v38 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v59 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[31:32], v[53:54], 16 ; SI-NEXT: s_branch .LBB63_3 ; SI-NEXT: .LBB63_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v63, v39 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB63_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB63_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -100960,9 +99916,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -100971,9 +99928,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -100982,9 +99940,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -100993,9 +99951,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -101004,12 +99962,20 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -101018,45 +99984,45 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: @@ -103760,34 +102726,32 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -103798,19 +102762,20 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -103878,19 +102843,15 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 @@ -103898,100 +102859,97 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -104003,27 +102961,31 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -104055,125 +103017,116 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_addc_u32_e32 v46, vcc, 0, v62, vcc +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v46 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 -; SI-NEXT: v_mov_b32_e32 v58, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -104190,17 +103143,23 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 @@ -104208,343 +103167,249 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 -; SI-NEXT: v_mov_b32_e32 v50, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v36, v30 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v31, v29 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64f16: @@ -104728,61 +103593,61 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_readfirstlane_b32 s41, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_readfirstlane_b32 s45, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_readfirstlane_b32 s56, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s26, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s28, v20 -; SI-NEXT: v_readfirstlane_b32 s22, v1 -; SI-NEXT: v_readfirstlane_b32 s23, v2 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: v_readfirstlane_b32 s21, v4 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s16, v7 -; SI-NEXT: v_readfirstlane_b32 s17, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v12 -; SI-NEXT: v_readfirstlane_b32 s10, v13 -; SI-NEXT: v_readfirstlane_b32 s11, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v0 +; SI-NEXT: v_readfirstlane_b32 s23, v1 +; SI-NEXT: v_readfirstlane_b32 s20, v2 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_readfirstlane_b32 s18, v4 +; SI-NEXT: v_readfirstlane_b32 s19, v5 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s9, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -104801,105 +103666,104 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s56, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s40, 3 @@ -104965,307 +103829,174 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: s_add_u32 s6, s6, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 ; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: .LBB65_3: ; %end ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v10, v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v12, v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_or_b32_e32 v16, v30, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 +; SI-NEXT: v_or_b32_e32 v24, v30, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -105281,78 +104012,120 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v26, v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v17, v54, v17 +; SI-NEXT: v_or_b32_e32 v19, v52, v19 +; SI-NEXT: v_or_b32_e32 v21, v50, v21 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v29, v34, v29 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v16i64_to_v64f16_scalar: @@ -105607,755 +104380,789 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v16i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v58, v2 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_or_b32_e32 v25, v51, v25 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v52, v27 -; SI-NEXT: v_or_b32_e32 v28, v39, v28 -; SI-NEXT: v_or_b32_e32 v29, v37, v29 -; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v18, v59, v18 +; SI-NEXT: v_or_b32_e32 v19, v57, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v45, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: v_or_b32_e32 v25, v53, v25 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v27, v49, v27 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v36, v29 +; SI-NEXT: v_or_b32_e32 v30, v34, v30 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v61, v17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 @@ -106364,23 +105171,23 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16i64: @@ -106621,548 +105428,676 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB67_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_or_b32_e32 v22, v51, v22 -; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v50, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v49, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v26, v39, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v27, v38, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v28, v37, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v55, v19 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v53, v21 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v22, v52, v22 +; SI-NEXT: v_or_b32_e32 v23, v51, v23 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v49, v25 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v39, v27 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v63, v29 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_or_b32_e32 v30, v60, v30 +; SI-NEXT: v_or_b32_e32 v31, v57, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB67_3 ; SI-NEXT: .LBB67_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v36, v34 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v59 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB67_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v51, v43 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB67_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -107170,55 +106105,63 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -107226,46 +106169,43 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB67_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -107593,388 +106533,292 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB68_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB68_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64i16: @@ -108158,79 +107002,79 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s30, 0 -; SI-NEXT: v_writelane_b32 v21, s31, 1 -; SI-NEXT: v_writelane_b32 v21, s34, 2 -; SI-NEXT: v_writelane_b32 v21, s35, 3 -; SI-NEXT: v_writelane_b32 v21, s36, 4 -; SI-NEXT: v_writelane_b32 v21, s37, 5 -; SI-NEXT: v_writelane_b32 v21, s38, 6 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_writelane_b32 v21, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s56, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_writelane_b32 v21, s48, 8 -; SI-NEXT: v_readfirstlane_b32 s57, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_writelane_b32 v21, s49, 9 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_writelane_b32 v21, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s47, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_writelane_b32 v21, s51, 11 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_writelane_b32 v21, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s45, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 -; SI-NEXT: v_writelane_b32 v21, s53, 13 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_writelane_b32 v21, s54, 14 -; SI-NEXT: v_readfirstlane_b32 s43, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_writelane_b32 v21, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_writelane_b32 v21, s64, 16 -; SI-NEXT: v_readfirstlane_b32 s41, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_writelane_b32 v21, s65, 17 -; SI-NEXT: v_readfirstlane_b32 s24, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_writelane_b32 v21, s66, 18 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 -; SI-NEXT: v_writelane_b32 v21, s67, 19 -; SI-NEXT: v_readfirstlane_b32 s22, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v21, s68, 20 -; SI-NEXT: v_readfirstlane_b32 s23, v20 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_readfirstlane_b32 s19, v4 -; SI-NEXT: v_readfirstlane_b32 s16, v5 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v16 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s38, s5, 16 @@ -108251,8 +107095,8 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s69, s57, 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 @@ -108317,8 +107161,8 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s69, s57, 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 @@ -108335,247 +107179,157 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s36, 16 ; SI-NEXT: s_and_b32 s29, s56, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s57, 0xffff -; SI-NEXT: s_lshl_b32 s29, s69, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s34, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s68, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s67, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s42, 0xffff -; SI-NEXT: s_lshl_b32 s29, s94, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s66, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s40, 0xffff -; SI-NEXT: s_lshl_b32 s29, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s65, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s64, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s55, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s54, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s53, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s52, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s51, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s56, s60, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s69, v21, 21 -; SI-NEXT: v_readlane_b32 s68, v21, 20 -; SI-NEXT: v_readlane_b32 s67, v21, 19 -; SI-NEXT: v_readlane_b32 s66, v21, 18 -; SI-NEXT: v_readlane_b32 s65, v21, 17 -; SI-NEXT: v_readlane_b32 s64, v21, 16 -; SI-NEXT: v_readlane_b32 s55, v21, 15 -; SI-NEXT: v_readlane_b32 s54, v21, 14 -; SI-NEXT: v_readlane_b32 s53, v21, 13 -; SI-NEXT: v_readlane_b32 s52, v21, 12 -; SI-NEXT: v_readlane_b32 s51, v21, 11 -; SI-NEXT: v_readlane_b32 s50, v21, 10 -; SI-NEXT: v_readlane_b32 s49, v21, 9 -; SI-NEXT: v_readlane_b32 s48, v21, 8 -; SI-NEXT: v_readlane_b32 s39, v21, 7 -; SI-NEXT: v_readlane_b32 s38, v21, 6 -; SI-NEXT: v_readlane_b32 s37, v21, 5 -; SI-NEXT: v_readlane_b32 s36, v21, 4 -; SI-NEXT: v_readlane_b32 s35, v21, 3 -; SI-NEXT: v_readlane_b32 s34, v21, 2 -; SI-NEXT: v_readlane_b32 s31, v21, 1 -; SI-NEXT: v_readlane_b32 s30, v21, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: ; SI-NEXT: ; implicit-def: $sgpr36 @@ -108602,12 +107356,12 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB69_2 @@ -108864,186 +107618,319 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v16i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v53 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -109073,362 +107960,213 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v16, v16, v53 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v19, v19, v39 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v40, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v0, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v62 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_or_b32_e32 v4, v4, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v58 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: .LBB70_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v16, v53, v16 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 -; SI-NEXT: v_or_b32_e32 v5, v58, v5 -; SI-NEXT: v_or_b32_e32 v6, v57, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 -; SI-NEXT: v_or_b32_e32 v19, v39, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v15, v51, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v35, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v38, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -109669,442 +108407,387 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB71_2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v16, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v44 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB71_3 -; SI-NEXT: .LBB71_2: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB71_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB71_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB71_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -130220,90 +128903,23 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -130332,584 +128948,549 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB76_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v62 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64bf16: @@ -131037,8 +129618,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -131063,68 +129644,68 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 ; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v19, s16 ; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_mov_b32_e32 v19, s17 ; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_readfirstlane_b32 s5, v20 -; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_readfirstlane_b32 s5, v19 +; SI-NEXT: v_mov_b32_e32 v19, s18 ; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_readfirstlane_b32 s6, v20 -; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_readfirstlane_b32 s6, v19 +; SI-NEXT: v_mov_b32_e32 v19, s19 ; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_readfirstlane_b32 s7, v20 -; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_readfirstlane_b32 s7, v19 +; SI-NEXT: v_mov_b32_e32 v19, s20 ; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_readfirstlane_b32 s8, v20 -; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: v_mov_b32_e32 v19, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s9, v20 -; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_readfirstlane_b32 s9, v19 +; SI-NEXT: v_mov_b32_e32 v19, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s20, v20 -; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_readfirstlane_b32 s20, v19 +; SI-NEXT: v_mov_b32_e32 v19, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_readfirstlane_b32 s21, v20 -; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_mov_b32_e32 v19, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_readfirstlane_b32 s24, v20 -; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s41, v20 -; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s43, v20 -; SI-NEXT: v_readfirstlane_b32 s44, v1 -; SI-NEXT: v_readfirstlane_b32 s45, v2 -; SI-NEXT: v_readfirstlane_b32 s28, v3 -; SI-NEXT: v_readfirstlane_b32 s29, v4 -; SI-NEXT: v_readfirstlane_b32 s26, v5 -; SI-NEXT: v_readfirstlane_b32 s27, v6 -; SI-NEXT: v_readfirstlane_b32 s22, v7 -; SI-NEXT: v_readfirstlane_b32 s23, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v9 -; SI-NEXT: v_readfirstlane_b32 s19, v10 -; SI-NEXT: v_readfirstlane_b32 s16, v11 -; SI-NEXT: v_readfirstlane_b32 s17, v12 -; SI-NEXT: v_readfirstlane_b32 s14, v13 -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v15 -; SI-NEXT: v_readfirstlane_b32 s11, v16 -; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_readfirstlane_b32 s44, v0 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s28, v2 +; SI-NEXT: v_readfirstlane_b32 s29, v3 +; SI-NEXT: v_readfirstlane_b32 s26, v4 +; SI-NEXT: v_readfirstlane_b32 s27, v5 +; SI-NEXT: v_readfirstlane_b32 s22, v6 +; SI-NEXT: v_readfirstlane_b32 s23, v7 +; SI-NEXT: v_readfirstlane_b32 s18, v8 +; SI-NEXT: v_readfirstlane_b32 s19, v9 +; SI-NEXT: v_readfirstlane_b32 s16, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s11, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v16 ; SI-NEXT: s_and_b64 s[46:47], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -131212,94 +129793,103 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[2:3], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[51:52], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[23:24], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_add_f64 v[59:60], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[10:11], 1.0 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f64 v[10:11], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_add_f64 v[1:2], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[14:15], 1.0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v25 +; SI-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[26:27], 1.0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_add_f64 v[2:3], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[4:5], 1.0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v7 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 +; SI-NEXT: v_mov_b32_e32 v38, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 ; SI-NEXT: s_branch .LBB77_5 ; SI-NEXT: .LBB77_3: ; SI-NEXT: ; implicit-def: $sgpr46 @@ -131372,368 +129962,313 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v0, s59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v0, s58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_mov_b32_e32 v0, s56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s99 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s97 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s56 -; SI-NEXT: v_mov_b32_e32 v61, s4 +; SI-NEXT: v_mov_b32_e32 v0, s96 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v0, s86 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v6, s99 -; SI-NEXT: v_mov_b32_e32 v5, s98 -; SI-NEXT: v_mov_b32_e32 v8, s97 -; SI-NEXT: v_mov_b32_e32 v7, s96 -; SI-NEXT: v_mov_b32_e32 v10, s87 -; SI-NEXT: v_mov_b32_e32 v9, s86 -; SI-NEXT: v_mov_b32_e32 v12, s85 -; SI-NEXT: v_mov_b32_e32 v11, s84 -; SI-NEXT: v_mov_b32_e32 v14, s83 -; SI-NEXT: v_mov_b32_e32 v13, s82 -; SI-NEXT: v_mov_b32_e32 v16, s81 -; SI-NEXT: v_mov_b32_e32 v15, s80 -; SI-NEXT: v_mov_b32_e32 v18, s71 -; SI-NEXT: v_mov_b32_e32 v17, s70 -; SI-NEXT: v_mov_b32_e32 v20, s69 -; SI-NEXT: v_mov_b32_e32 v19, s68 -; SI-NEXT: v_mov_b32_e32 v22, s67 -; SI-NEXT: v_mov_b32_e32 v21, s66 -; SI-NEXT: v_mov_b32_e32 v24, s65 -; SI-NEXT: v_mov_b32_e32 v23, s64 -; SI-NEXT: v_mov_b32_e32 v26, s55 -; SI-NEXT: v_mov_b32_e32 v25, s54 -; SI-NEXT: v_mov_b32_e32 v28, s53 -; SI-NEXT: v_mov_b32_e32 v27, s52 -; SI-NEXT: v_mov_b32_e32 v30, s51 -; SI-NEXT: v_mov_b32_e32 v29, s50 -; SI-NEXT: v_mov_b32_e32 v32, s49 -; SI-NEXT: v_mov_b32_e32 v31, s48 -; SI-NEXT: v_mov_b32_e32 v34, s39 -; SI-NEXT: v_mov_b32_e32 v33, s38 -; SI-NEXT: v_mov_b32_e32 v36, s37 -; SI-NEXT: v_mov_b32_e32 v35, s36 -; SI-NEXT: v_mov_b32_e32 v38, s35 -; SI-NEXT: v_mov_b32_e32 v37, s34 -; SI-NEXT: v_mov_b32_e32 v48, s31 -; SI-NEXT: v_mov_b32_e32 v39, s30 -; SI-NEXT: v_mov_b32_e32 v50, s95 -; SI-NEXT: v_mov_b32_e32 v49, s94 -; SI-NEXT: v_mov_b32_e32 v52, s93 -; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v54, s91 -; SI-NEXT: v_mov_b32_e32 v53, s90 -; SI-NEXT: v_mov_b32_e32 v40, s89 -; SI-NEXT: v_mov_b32_e32 v55, s88 -; SI-NEXT: v_mov_b32_e32 v42, s79 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v43, s77 -; SI-NEXT: v_mov_b32_e32 v44, s76 -; SI-NEXT: v_mov_b32_e32 v46, s75 -; SI-NEXT: v_mov_b32_e32 v45, s74 -; SI-NEXT: v_mov_b32_e32 v47, s73 -; SI-NEXT: v_mov_b32_e32 v56, s72 -; SI-NEXT: v_mov_b32_e32 v58, s63 -; SI-NEXT: v_mov_b32_e32 v57, s62 -; SI-NEXT: v_mov_b32_e32 v60, s61 -; SI-NEXT: v_mov_b32_e32 v59, s60 +; SI-NEXT: v_mov_b32_e32 v38, s98 +; SI-NEXT: v_mov_b32_e32 v51, s87 +; SI-NEXT: v_mov_b32_e32 v53, s85 +; SI-NEXT: v_mov_b32_e32 v52, s84 +; SI-NEXT: v_mov_b32_e32 v55, s83 +; SI-NEXT: v_mov_b32_e32 v54, s82 +; SI-NEXT: v_mov_b32_e32 v41, s81 +; SI-NEXT: v_mov_b32_e32 v40, s80 +; SI-NEXT: v_mov_b32_e32 v43, s71 +; SI-NEXT: v_mov_b32_e32 v42, s70 +; SI-NEXT: v_mov_b32_e32 v45, s69 +; SI-NEXT: v_mov_b32_e32 v44, s68 +; SI-NEXT: v_mov_b32_e32 v47, s67 +; SI-NEXT: v_mov_b32_e32 v46, s66 +; SI-NEXT: v_mov_b32_e32 v57, s65 +; SI-NEXT: v_mov_b32_e32 v56, s64 +; SI-NEXT: v_mov_b32_e32 v59, s55 +; SI-NEXT: v_mov_b32_e32 v58, s54 +; SI-NEXT: v_mov_b32_e32 v61, s53 +; SI-NEXT: v_mov_b32_e32 v60, s52 +; SI-NEXT: v_mov_b32_e32 v48, s51 +; SI-NEXT: v_mov_b32_e32 v39, s50 +; SI-NEXT: v_mov_b32_e32 v33, s49 +; SI-NEXT: v_mov_b32_e32 v32, s48 +; SI-NEXT: v_mov_b32_e32 v31, s39 +; SI-NEXT: v_mov_b32_e32 v30, s38 +; SI-NEXT: v_mov_b32_e32 v29, s37 +; SI-NEXT: v_mov_b32_e32 v28, s36 +; SI-NEXT: v_mov_b32_e32 v27, s35 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: v_mov_b32_e32 v25, s31 +; SI-NEXT: v_mov_b32_e32 v24, s30 +; SI-NEXT: v_mov_b32_e32 v23, s95 +; SI-NEXT: v_mov_b32_e32 v22, s94 +; SI-NEXT: v_mov_b32_e32 v21, s93 +; SI-NEXT: v_mov_b32_e32 v20, s92 +; SI-NEXT: v_mov_b32_e32 v19, s91 +; SI-NEXT: v_mov_b32_e32 v18, s90 +; SI-NEXT: v_mov_b32_e32 v17, s89 +; SI-NEXT: v_mov_b32_e32 v50, s88 +; SI-NEXT: v_mov_b32_e32 v15, s79 +; SI-NEXT: v_mov_b32_e32 v14, s78 +; SI-NEXT: v_mov_b32_e32 v12, s77 +; SI-NEXT: v_mov_b32_e32 v36, s76 +; SI-NEXT: v_mov_b32_e32 v11, s75 +; SI-NEXT: v_mov_b32_e32 v10, s74 +; SI-NEXT: v_mov_b32_e32 v49, s73 +; SI-NEXT: v_mov_b32_e32 v13, s72 +; SI-NEXT: v_mov_b32_e32 v1, s63 +; SI-NEXT: v_mov_b32_e32 v6, s62 +; SI-NEXT: v_mov_b32_e32 v5, s61 +; SI-NEXT: v_mov_b32_e32 v4, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[5:6], 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[58:59], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[48:49], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[29:30], v[38:39], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[30:31], v[60:61], 16 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64bf16_scalar: @@ -131934,382 +130469,266 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v16f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v37 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -132355,282 +130774,416 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v19, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v57, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v46, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v44, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v42, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v40, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v54, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v52, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v50, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v48, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v35, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_alignbit_b32 v18, v18, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB78_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 ; SI-NEXT: .LBB78_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v16f64: @@ -134908,667 +133461,665 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v33 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[0:1], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshr_b64 v[2:3], v[58:59], 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshr_b64 v[3:4], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v4, v41 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[41:42], 16 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_mov_b32_e32 v61, v37 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v39 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v6, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v8, v46 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[8:9], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v10, v44 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[44:45], 16 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v13, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[18:19], 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshr_b64 v[18:19], v[34:35], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v20, v51 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[51:52], 16 ; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_mov_b32_e32 v22, v49 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[49:50], 16 ; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v38 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v59 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[31:32], v[53:54], 16 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v63, v39 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v48, v53 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -135576,9 +134127,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -135587,9 +134139,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -135598,9 +134151,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -135609,9 +134162,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -135620,12 +134173,20 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -135634,45 +134195,45 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: @@ -138377,185 +136938,174 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: v_mov_b32_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -138576,168 +137126,172 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: v_add_f64 v[32:33], v[10:11], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -138756,375 +137310,278 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64f16: @@ -139251,22 +137708,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-LABEL: bitcast_v16f64_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v31, s22 -; SI-NEXT: v_mov_b32_e32 v32, s23 -; SI-NEXT: v_mov_b32_e32 v29, s24 -; SI-NEXT: v_mov_b32_e32 v30, s25 -; SI-NEXT: v_mov_b32_e32 v25, s26 -; SI-NEXT: v_mov_b32_e32 v26, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v30, s22 +; SI-NEXT: v_mov_b32_e32 v31, s23 +; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v24, s26 +; SI-NEXT: v_mov_b32_e32 v25, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v24, s29 +; SI-NEXT: v_mov_b32_e32 v22, s28 +; SI-NEXT: v_mov_b32_e32 v23, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -139285,288 +137742,281 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB81_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 ; SI-NEXT: s_cbranch_execnz .LBB81_3 ; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[33:34], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f64 v[39:40], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[0:1], v[20:21], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 +; SI-NEXT: v_add_f64 v[0:1], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[1:2], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: v_add_f64 v[1:2], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -139575,336 +138025,237 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 -; SI-NEXT: v_mov_b32_e32 v48, v16 -; SI-NEXT: v_mov_b32_e32 v38, v17 -; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v35, v17 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v35 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v41 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -139921,106 +138272,111 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_branch .LBB81_2 ; ; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: @@ -140221,755 +138577,789 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v16f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v58, v2 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_or_b32_e32 v25, v51, v25 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v52, v27 -; SI-NEXT: v_or_b32_e32 v28, v39, v28 -; SI-NEXT: v_or_b32_e32 v29, v37, v29 -; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v18, v59, v18 +; SI-NEXT: v_or_b32_e32 v19, v57, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v45, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: v_or_b32_e32 v25, v53, v25 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v27, v49, v27 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v36, v29 +; SI-NEXT: v_or_b32_e32 v30, v34, v30 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v61, v17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB82_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 @@ -140978,23 +139368,23 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB82_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16f64: @@ -141235,548 +139625,676 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-LABEL: bitcast_v64f16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB83_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_or_b32_e32 v22, v51, v22 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v50, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v49, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v26, v39, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v27, v38, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v28, v37, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB83_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v55, v19 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v53, v21 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v22, v52, v22 +; SI-NEXT: v_or_b32_e32 v23, v51, v23 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v49, v25 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v39, v27 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v28, v38, v28 +; SI-NEXT: v_or_b32_e32 v29, v63, v29 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_or_b32_e32 v30, v60, v30 +; SI-NEXT: v_or_b32_e32 v31, v57, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB83_3 ; SI-NEXT: .LBB83_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v36, v34 ; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v59 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB83_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v51, v43 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB83_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -141784,55 +140302,63 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -141840,46 +140366,43 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: .LBB83_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -142208,373 +140731,276 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB84_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB84_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v43 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64i16: @@ -142701,22 +141127,22 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-LABEL: bitcast_v16f64_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -142735,309 +141161,229 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB85_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB85_3 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 -; SI-NEXT: .LBB85_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v50 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: .LBB85_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v10, v54 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v13, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -143054,46 +141400,57 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v17, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_branch .LBB85_2 ; ; VI-LABEL: bitcast_v16f64_to_v64i16_scalar: @@ -143294,186 +141651,319 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v16f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v53 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -143503,255 +141993,53 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v16, v16, v53 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v19, v19, v39 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_or_b32_e32 v0, v0, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v62 -; SI-NEXT: v_or_b32_e32 v2, v2, v61 -; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_or_b32_e32 v4, v4, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v58 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB86_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v16, v53, v16 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -143760,32 +142048,21 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 -; SI-NEXT: v_or_b32_e32 v5, v58, v5 -; SI-NEXT: v_or_b32_e32 v6, v57, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 -; SI-NEXT: v_or_b32_e32 v19, v39, v19 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v40, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -143793,72 +142070,136 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v51, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v35, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v38, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB86_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -144099,442 +142440,387 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB87_2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v16, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v44 +; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB87_3 -; SI-NEXT: .LBB87_2: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB87_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB87_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB87_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB87_2 ; ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -144967,1041 +143253,1112 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v128i8_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v5 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v14 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v54 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v31, v7, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; SI-NEXT: v_or_b32_e32 v32, v4, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v29, v45, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: v_or_b32_e32 v33, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v34, v47, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v27, v54, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v42 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v35, v60, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v46 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v36, v5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v44 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v37, v61, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v38, v62, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_or_b32_e32 v21, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_or_b32_e32 v39, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_or_b32_e32 v48, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_or_b32_e32 v49, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_or_b32_e32 v50, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v52, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v6, v5, v6 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v53, v15, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v19, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v57, v9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v26, v19, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v19, v5, v26 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v25, v3, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v23, v17, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v24, v17, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v18, v3, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v30, v17, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v28, v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -146063,583 +144420,563 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v59 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v17, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v7, v5, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v11, v41, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v8, v5, v44 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v14, v13, v61 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v5, v45, v5 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v7, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_or_b32_e32 v5, v62, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v54 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v11 +; SI-NEXT: v_or_b32_e32 v5, v43, v5 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_or_b32_e32 v9, v57, v9 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v21 -; SI-NEXT: v_or_b32_e32 v10, v38, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v8, v54, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v12, v12, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x300, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v14, v59, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 -; SI-NEXT: v_or_b32_e32 v16, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v17, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v0 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v8, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v18, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v9, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v19, v7 -; SI-NEXT: v_or_b32_e32 v19, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v10, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v20, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v11, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v21, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v16, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v22, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v17, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v23, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v24, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v19, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v19 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v25, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v21, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v23, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v6, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v28 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_or_b32_e32 v29, v2, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v30, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v30, v12 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v23, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v23, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v29, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v30, v19 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; SI-NEXT: v_or_b32_e32 v15, v30, v15 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v30, v15 +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v30 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -146648,458 +144985,318 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v14 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v52 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v55 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64bf16: @@ -151224,26 +149421,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300 ; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: s_mov_b32 s75, s27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v44, s19, 0 -; SI-NEXT: v_writelane_b32 v44, s18, 1 -; SI-NEXT: v_writelane_b32 v44, s17, 2 -; SI-NEXT: v_writelane_b32 v44, s16, 3 +; SI-NEXT: v_writelane_b32 v44, s25, 0 +; SI-NEXT: v_writelane_b32 v44, s22, 1 +; SI-NEXT: v_writelane_b32 v44, s19, 2 +; SI-NEXT: v_writelane_b32 v44, s18, 3 +; SI-NEXT: s_mov_b32 s74, s29 +; SI-NEXT: s_mov_b32 s6, s23 +; SI-NEXT: s_mov_b32 s76, s28 +; SI-NEXT: s_mov_b32 s77, s26 +; SI-NEXT: s_mov_b32 s79, s24 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -151268,9 +149470,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s74, s29 -; SI-NEXT: s_mov_b32 s78, s28 -; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -151280,308 +149479,312 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: v_readfirstlane_b32 s49, v19 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: v_writelane_b32 v43, s37, 0 -; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v43, s38, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v43, s39, 2 -; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v43, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v43, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v43, s50, 5 -; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v43, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v43, s52, 7 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v43, s53, 8 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v43, s54, 9 -; SI-NEXT: v_writelane_b32 v43, s55, 10 -; SI-NEXT: s_mov_b32 s57, s24 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s6, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_readfirstlane_b32 s50, v18 +; SI-NEXT: v_writelane_b32 v43, s49, 0 +; SI-NEXT: v_readfirstlane_b32 s51, v24 +; SI-NEXT: v_writelane_b32 v43, s50, 1 +; SI-NEXT: v_readfirstlane_b32 s52, v25 +; SI-NEXT: v_writelane_b32 v43, s51, 2 +; SI-NEXT: v_readfirstlane_b32 s53, v23 +; SI-NEXT: v_writelane_b32 v43, s52, 3 +; SI-NEXT: v_readfirstlane_b32 s54, v22 +; SI-NEXT: v_writelane_b32 v43, s53, 4 +; SI-NEXT: v_readfirstlane_b32 s55, v28 +; SI-NEXT: v_writelane_b32 v43, s54, 5 +; SI-NEXT: v_readfirstlane_b32 s64, v29 +; SI-NEXT: v_writelane_b32 v43, s55, 6 +; SI-NEXT: v_readfirstlane_b32 s65, v27 +; SI-NEXT: v_writelane_b32 v43, s64, 7 +; SI-NEXT: v_readfirstlane_b32 s66, v26 +; SI-NEXT: v_writelane_b32 v43, s65, 8 +; SI-NEXT: v_writelane_b32 v43, s66, 9 +; SI-NEXT: v_readfirstlane_b32 s85, v30 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s89, v4 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v44, s4, 4 -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v44, s4, 5 -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: v_writelane_b32 v44, s4, 6 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v44, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v44, s4, 8 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256 ; SI-NEXT: v_writelane_b32 v44, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v44, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s77, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s90, v5 +; SI-NEXT: v_readfirstlane_b32 s91, v3 +; SI-NEXT: v_readfirstlane_b32 s92, v2 +; SI-NEXT: v_readfirstlane_b32 s93, v8 +; SI-NEXT: v_readfirstlane_b32 s94, v9 +; SI-NEXT: v_readfirstlane_b32 s95, v7 +; SI-NEXT: v_readfirstlane_b32 s30, v13 +; SI-NEXT: v_readfirstlane_b32 s31, v11 +; SI-NEXT: v_readfirstlane_b32 s34, v10 +; SI-NEXT: v_readfirstlane_b32 s35, v16 +; SI-NEXT: v_readfirstlane_b32 s36, v17 +; SI-NEXT: v_readfirstlane_b32 s37, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v14 +; SI-NEXT: v_readfirstlane_b32 s39, v20 +; SI-NEXT: v_readfirstlane_b32 s48, v21 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s24, v40 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v44, s4, 12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v44, s4, 13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v44, s4, 14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v44, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v44, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s21, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v44, s4, 16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s58, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s78, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s44, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: v_readfirstlane_b32 s47, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s61, v36 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v44, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s27, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s88, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v38 +; SI-NEXT: v_readfirstlane_b32 s43, v37 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s43, v39 +; SI-NEXT: v_readfirstlane_b32 s45, v38 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s29, v39 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s42, v49 +; SI-NEXT: v_readfirstlane_b32 s63, v48 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s11, v49 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s23, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s13, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s88, v32 +; SI-NEXT: v_readfirstlane_b32 s60, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s79, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: v_writelane_b32 v44, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v44, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s15, v35 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v44, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v44, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; SI-NEXT: v_writelane_b32 v44, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v44, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v44, s4, 27 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v44, s4, 28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v44, s4, 29 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v44, s4, 30 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v44, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v44, s4, 32 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v44, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v44, s4, 33 -; SI-NEXT: v_writelane_b32 v44, s22, 34 -; SI-NEXT: v_writelane_b32 v44, s23, 35 -; SI-NEXT: v_writelane_b32 v44, s73, 36 -; SI-NEXT: v_writelane_b32 v44, s20, 37 -; SI-NEXT: v_writelane_b32 v44, s47, 38 -; SI-NEXT: v_writelane_b32 v44, s76, 39 -; SI-NEXT: v_writelane_b32 v44, s25, 40 -; SI-NEXT: v_writelane_b32 v44, s57, 41 +; SI-NEXT: v_writelane_b32 v44, s17, 34 +; SI-NEXT: v_writelane_b32 v44, s16, 35 +; SI-NEXT: v_writelane_b32 v44, s6, 36 +; SI-NEXT: v_writelane_b32 v44, s21, 37 +; SI-NEXT: v_writelane_b32 v44, s20, 38 +; SI-NEXT: v_writelane_b32 v44, s77, 39 +; SI-NEXT: v_writelane_b32 v44, s75, 40 +; SI-NEXT: v_writelane_b32 v44, s79, 41 ; SI-NEXT: v_writelane_b32 v44, s74, 42 -; SI-NEXT: v_writelane_b32 v44, s78, 43 -; SI-NEXT: v_writelane_b32 v44, s24, 44 -; SI-NEXT: v_writelane_b32 v44, s16, 45 -; SI-NEXT: v_writelane_b32 v44, s17, 46 -; SI-NEXT: v_writelane_b32 v44, s18, 47 -; SI-NEXT: v_writelane_b32 v44, s19, 48 -; SI-NEXT: v_writelane_b32 v44, s77, 49 -; SI-NEXT: v_writelane_b32 v44, s89, 50 -; SI-NEXT: v_writelane_b32 v44, s90, 51 -; SI-NEXT: v_writelane_b32 v44, s91, 52 -; SI-NEXT: v_writelane_b32 v44, s92, 53 -; SI-NEXT: v_writelane_b32 v44, s93, 54 -; SI-NEXT: v_writelane_b32 v44, s94, 55 -; SI-NEXT: v_writelane_b32 v44, s95, 56 +; SI-NEXT: v_writelane_b32 v44, s76, 43 +; SI-NEXT: v_writelane_b32 v44, s18, 44 +; SI-NEXT: v_writelane_b32 v44, s19, 45 +; SI-NEXT: v_writelane_b32 v44, s89, 46 +; SI-NEXT: v_writelane_b32 v44, s90, 47 +; SI-NEXT: v_writelane_b32 v44, s91, 48 +; SI-NEXT: v_writelane_b32 v44, s92, 49 +; SI-NEXT: v_writelane_b32 v44, s93, 50 +; SI-NEXT: v_writelane_b32 v44, s94, 51 +; SI-NEXT: v_writelane_b32 v44, s95, 52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s24, v31 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s28, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s10, v34 +; SI-NEXT: v_readfirstlane_b32 s26, v33 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s66, v35 -; SI-NEXT: v_readfirstlane_b32 s28, v31 -; SI-NEXT: v_readfirstlane_b32 s27, v32 +; SI-NEXT: v_readfirstlane_b32 s8, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s29, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s69, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s14, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s68, v39 +; SI-NEXT: v_readfirstlane_b32 s9, v37 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s11, v49 +; SI-NEXT: v_readfirstlane_b32 s73, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s70, v50 +; SI-NEXT: v_readfirstlane_b32 s69, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s71, v51 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: v_readfirstlane_b32 s70, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s7, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s59, v51 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 -; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 -; SI-NEXT: v_writelane_b32 v44, s30, 59 -; SI-NEXT: v_writelane_b32 v44, s31, 60 -; SI-NEXT: v_writelane_b32 v44, s34, 61 -; SI-NEXT: v_writelane_b32 v44, s35, 62 -; SI-NEXT: v_writelane_b32 v44, s36, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v6 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v12 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 53 +; SI-NEXT: v_writelane_b32 v44, vcc_hi, 54 +; SI-NEXT: v_writelane_b32 v44, s30, 55 +; SI-NEXT: v_writelane_b32 v44, s31, 56 +; SI-NEXT: v_writelane_b32 v44, s34, 57 +; SI-NEXT: v_writelane_b32 v44, s35, 58 +; SI-NEXT: v_writelane_b32 v44, s36, 59 +; SI-NEXT: v_writelane_b32 v44, s37, 60 +; SI-NEXT: v_writelane_b32 v44, s38, 61 +; SI-NEXT: v_writelane_b32 v44, s39, 62 +; SI-NEXT: v_writelane_b32 v44, s48, 63 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s57, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s41, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s60, v31 +; SI-NEXT: v_readfirstlane_b32 s40, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s62, v32 +; SI-NEXT: v_readfirstlane_b32 s61, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v33 +; SI-NEXT: v_readfirstlane_b32 s82, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s98, v36 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s81, v35 +; SI-NEXT: v_readfirstlane_b32 s14, v37 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s72, v36 +; SI-NEXT: v_readfirstlane_b32 s71, v49 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_readfirstlane_b32 s56, v50 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s99, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s82, v39 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s81, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s80, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s83, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s99, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s12, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 @@ -151589,85 +149792,71 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s15, v49 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s96, v50 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s7, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s41, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s97, v32 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s44, v33 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s9, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: v_readfirstlane_b32 s22, v31 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s86, v36 +; SI-NEXT: v_readfirstlane_b32 s62, v32 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s85, v37 +; SI-NEXT: v_readfirstlane_b32 s97, v33 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s8, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s12, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s65, v48 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v43, s64, 11 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s46, v34 +; SI-NEXT: v_readfirstlane_b32 s42, v48 +; SI-NEXT: v_readfirstlane_b32 s68, v49 ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v43, s65, 12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_writelane_b32 v43, s67, 10 ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v43, s67, 13 -; SI-NEXT: v_writelane_b32 v43, s84, 14 -; SI-NEXT: v_writelane_b32 v43, s85, 15 -; SI-NEXT: v_writelane_b32 v43, s86, 16 -; SI-NEXT: v_writelane_b32 v43, s87, 17 -; SI-NEXT: v_writelane_b32 v43, s8, 18 -; SI-NEXT: v_writelane_b32 v43, s99, 19 -; SI-NEXT: v_writelane_b32 v43, s12, 20 -; SI-NEXT: v_writelane_b32 v43, s44, 21 -; SI-NEXT: v_writelane_b32 v43, s97, 22 -; SI-NEXT: v_writelane_b32 v43, s15, 23 -; SI-NEXT: v_writelane_b32 v43, s96, 24 -; SI-NEXT: v_writelane_b32 v43, s98, 25 -; SI-NEXT: v_writelane_b32 v43, s83, 26 -; SI-NEXT: v_writelane_b32 v43, s82, 27 -; SI-NEXT: v_writelane_b32 v43, s9, 28 -; SI-NEXT: v_writelane_b32 v43, s81, 29 +; SI-NEXT: v_writelane_b32 v43, s68, 11 +; SI-NEXT: v_writelane_b32 v43, s84, 12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s86, v38 +; SI-NEXT: v_writelane_b32 v43, s85, 13 +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_writelane_b32 v43, s86, 14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s96, v39 +; SI-NEXT: v_writelane_b32 v43, s87, 15 +; SI-NEXT: v_writelane_b32 v43, s96, 16 +; SI-NEXT: v_writelane_b32 v43, s83, 17 +; SI-NEXT: v_writelane_b32 v43, s99, 18 +; SI-NEXT: v_writelane_b32 v43, s42, 19 +; SI-NEXT: v_writelane_b32 v43, s46, 20 +; SI-NEXT: v_writelane_b32 v43, s97, 21 +; SI-NEXT: v_writelane_b32 v43, s56, 22 +; SI-NEXT: v_writelane_b32 v43, s98, 23 +; SI-NEXT: v_writelane_b32 v43, s82, 24 +; SI-NEXT: v_readfirstlane_b32 s10, v35 +; SI-NEXT: v_writelane_b32 v43, s81, 25 +; SI-NEXT: v_writelane_b32 v43, s10, 26 +; SI-NEXT: v_readfirstlane_b32 s25, v36 +; SI-NEXT: v_writelane_b32 v43, s12, 27 +; SI-NEXT: v_writelane_b32 v43, s25, 28 +; SI-NEXT: v_writelane_b32 v43, s14, 29 ; SI-NEXT: v_writelane_b32 v43, s80, 30 -; SI-NEXT: v_writelane_b32 v43, s7, 31 -; SI-NEXT: v_writelane_b32 v43, s72, 32 -; SI-NEXT: v_writelane_b32 v43, s26, 33 -; SI-NEXT: v_writelane_b32 v43, s41, 34 -; SI-NEXT: v_writelane_b32 v43, s14, 35 -; SI-NEXT: v_writelane_b32 v43, s69, 36 -; SI-NEXT: v_writelane_b32 v43, s71, 37 -; SI-NEXT: v_writelane_b32 v43, s70, 38 -; SI-NEXT: v_writelane_b32 v43, s68, 39 -; SI-NEXT: v_writelane_b32 v43, s60, 40 -; SI-NEXT: v_writelane_b32 v43, s62, 41 -; SI-NEXT: v_writelane_b32 v43, s11, 42 -; SI-NEXT: v_writelane_b32 v43, s10, 43 -; SI-NEXT: v_writelane_b32 v43, s58, 44 -; SI-NEXT: v_writelane_b32 v43, s66, 45 -; SI-NEXT: v_writelane_b32 v43, s29, 46 +; SI-NEXT: v_writelane_b32 v43, s22, 31 +; SI-NEXT: v_writelane_b32 v43, s62, 32 +; SI-NEXT: v_writelane_b32 v43, s71, 33 +; SI-NEXT: v_writelane_b32 v43, s70, 34 +; SI-NEXT: v_writelane_b32 v43, s69, 35 +; SI-NEXT: v_writelane_b32 v43, s59, 36 +; SI-NEXT: v_writelane_b32 v43, s57, 37 +; SI-NEXT: v_writelane_b32 v43, s41, 38 +; SI-NEXT: v_writelane_b32 v43, s40, 39 +; SI-NEXT: v_writelane_b32 v43, s61, 40 +; SI-NEXT: v_writelane_b32 v43, s72, 41 +; SI-NEXT: v_writelane_b32 v43, s8, 42 +; SI-NEXT: v_writelane_b32 v43, s7, 43 +; SI-NEXT: v_writelane_b32 v43, s9, 44 +; SI-NEXT: v_writelane_b32 v43, s73, 45 +; SI-NEXT: v_writelane_b32 v43, s24, 46 ; SI-NEXT: v_writelane_b32 v43, s28, 47 -; SI-NEXT: v_writelane_b32 v43, s27, 48 +; SI-NEXT: v_writelane_b32 v43, s26, 48 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v43, s4, 57 ; SI-NEXT: v_readlane_b32 s4, v44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: v_readlane_b32 s5, v44, 2 @@ -151675,404 +149864,392 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_writelane_b32 v43, s4, 58 -; SI-NEXT: v_readlane_b32 s4, v44, 1 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v44, 0 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: v_readlane_b32 s5, v44, 1 +; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 60 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_readlane_b32 s6, v44, 0 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 61 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_writelane_b32 v43, s4, 60 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v43, s5, 61 +; SI-NEXT: s_and_b32 s5, s77, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 62 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s78, 0xff +; SI-NEXT: s_lshl_b32 s6, s75, 24 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v43, s5, 62 +; SI-NEXT: s_and_b32 s5, s76, 0xff ; SI-NEXT: s_lshl_b32 s6, s74, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 63 -; SI-NEXT: s_or_b32 s4, s16, s6 -; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: s_lshl_b32 s16, s19, 24 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v43, s6, 63 +; SI-NEXT: s_and_b32 s6, s92, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s91, 24 +; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: s_lshl_b32 s16, s90, 24 +; SI-NEXT: v_writelane_b32 v42, s6, 0 +; SI-NEXT: s_and_b32 s6, s89, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s77, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 0 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: v_writelane_b32 v42, s6, 1 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s76, s16, s6 -; SI-NEXT: s_and_b32 s6, s93, 0xff -; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_and_b32 s6, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s16, s95, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 -; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: s_and_b32 s16, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s77, s17, s16 -; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s17, s94, 24 +; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: s_and_b32 s16, s34, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s25, s17, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s17, s31, 24 +; SI-NEXT: s_mov_b32 s4, s63 +; SI-NEXT: s_or_b32 s63, s17, s16 +; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s74, s17, s16 -; SI-NEXT: s_and_b32 s16, s35, 0xff -; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_lshl_b32 s17, s30, 24 +; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_and_b32 s16, s38, 0xff +; SI-NEXT: s_lshl_b32 s17, s37, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: s_and_b32 s17, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s78, s18, s17 -; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: s_lshl_b32 s18, s36, 24 +; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_and_b32 s17, s50, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_mov_b32 s31, s88 -; SI-NEXT: s_or_b32 s88, s18, s17 -; SI-NEXT: s_and_b32 s17, s36, 0xff +; SI-NEXT: s_lshl_b32 s18, s49, 24 +; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s89, s18, s17 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_or_b32 s37, s18, s17 +; SI-NEXT: s_and_b32 s17, s54, 0xff +; SI-NEXT: s_lshl_b32 s18, s53, 8 +; SI-NEXT: s_or_b32 s18, s17, s18 ; SI-NEXT: s_and_b32 s17, s51, 0xff -; SI-NEXT: s_lshl_b32 s18, s50, 8 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s48, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_writelane_b32 v43, s18, 49 -; SI-NEXT: s_and_b32 s18, s55, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_mov_b32 s73, s79 -; SI-NEXT: s_or_b32 s79, s19, s18 -; SI-NEXT: s_and_b32 s18, s52, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 -; SI-NEXT: s_and_b32 s18, s84, 0xff -; SI-NEXT: s_lshl_b32 s19, s67, 8 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s19, s64, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s95, s20, s19 -; SI-NEXT: s_and_b32 s19, s12, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s8, 24 -; SI-NEXT: s_or_b32 s8, s20, s19 -; SI-NEXT: s_and_b32 s19, s85, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s80, 0xff -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: s_or_b32 vcc_lo, s19, s20 -; SI-NEXT: s_and_b32 s19, s44, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s41, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s96, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s12, 50 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s82, 8 -; SI-NEXT: s_or_b32 vcc_hi, s19, s20 -; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s52, 24 +; SI-NEXT: s_or_b32 s30, s19, s17 +; SI-NEXT: s_and_b32 s17, s66, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s65, 24 +; SI-NEXT: s_or_b32 s91, s19, s17 +; SI-NEXT: s_and_b32 s17, s55, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s19, s64, 24 +; SI-NEXT: s_or_b32 s89, s19, s17 +; SI-NEXT: s_and_b32 s17, s85, 0xff +; SI-NEXT: s_lshl_b32 s19, s84, 8 +; SI-NEXT: s_or_b32 s19, s17, s19 +; SI-NEXT: s_and_b32 s17, s67, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s68, 24 +; SI-NEXT: s_or_b32 s90, s20, s17 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s96, 24 +; SI-NEXT: s_or_b32 s42, s20, s17 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s72, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s81, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 52 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s98, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_or_b32 s92, s20, s17 +; SI-NEXT: s_and_b32 s17, s25, 0xff +; SI-NEXT: s_lshl_b32 s20, s10, 8 +; SI-NEXT: s_or_b32 vcc_lo, s17, s20 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s97, 24 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s62, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s22, 24 +; SI-NEXT: s_or_b32 s75, s20, s17 +; SI-NEXT: s_and_b32 s17, s81, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s56, 24 +; SI-NEXT: v_writelane_b32 v43, s10, 49 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s71, 0xff +; SI-NEXT: s_lshl_b32 s20, s12, 8 +; SI-NEXT: s_or_b32 vcc_hi, s17, s20 +; SI-NEXT: s_and_b32 s17, s99, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 54 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s62, 0xff -; SI-NEXT: s_lshl_b32 s20, s60, 8 -; SI-NEXT: s_or_b32 s84, s19, s20 -; SI-NEXT: s_and_b32 s19, s71, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 53 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: s_or_b32 s57, s20, s19 -; SI-NEXT: s_and_b32 s19, s14, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: v_writelane_b32 v43, s10, 50 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s80, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s14, 24 +; SI-NEXT: v_writelane_b32 v43, s10, 51 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s98, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s82, 24 +; SI-NEXT: v_writelane_b32 v43, s10, 53 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xff +; SI-NEXT: s_lshl_b32 s20, s40, 8 +; SI-NEXT: s_or_b32 s84, s17, s20 +; SI-NEXT: s_and_b32 s17, s57, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: v_writelane_b32 v43, s10, 52 +; SI-NEXT: s_or_b32 s10, s20, s17 +; SI-NEXT: s_and_b32 s17, s7, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s41, 24 +; SI-NEXT: s_or_b32 s7, s20, s17 +; SI-NEXT: s_and_b32 s17, s70, 0xff +; SI-NEXT: v_writelane_b32 v43, s10, 54 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 55 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s29, 0xff -; SI-NEXT: s_lshl_b32 s20, s66, 8 -; SI-NEXT: s_or_b32 s85, s19, s20 -; SI-NEXT: s_and_b32 s19, s10, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s58, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 56 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 56 +; SI-NEXT: s_or_b32 s7, s20, s17 +; SI-NEXT: s_and_b32 s17, s73, 0xff +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: s_or_b32 s85, s17, s20 +; SI-NEXT: s_and_b32 s17, s72, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s8, 24 +; SI-NEXT: s_or_b32 s8, s20, s17 +; SI-NEXT: s_and_b32 s17, s26, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 57 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_or_b32 s7, s20, s17 +; SI-NEXT: s_and_b32 s17, s24, 0xff ; SI-NEXT: v_readlane_b32 s9, v44, 33 -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 ; SI-NEXT: v_readlane_b32 s9, v44, 32 -; SI-NEXT: s_or_b32 s10, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: s_or_b32 s51, s20, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff ; SI-NEXT: v_readlane_b32 s9, v44, 31 ; SI-NEXT: s_lshl_b32 s20, s9, 8 ; SI-NEXT: v_readlane_b32 s9, v44, 30 -; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: s_or_b32 s86, s17, s20 +; SI-NEXT: s_and_b32 s17, s9, 0xff ; SI-NEXT: v_readlane_b32 s9, v44, 29 -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 ; SI-NEXT: v_readlane_b32 s9, v44, 28 -; SI-NEXT: s_or_b32 s47, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: s_or_b32 s53, s20, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff ; SI-NEXT: v_readlane_b32 s9, v44, 27 -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 26 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 25 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 24 -; SI-NEXT: s_or_b32 s24, s20, s19 -; SI-NEXT: s_mov_b32 s92, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 23 -; SI-NEXT: s_mov_b32 s36, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 8 -; SI-NEXT: v_readlane_b32 s11, v44, 22 -; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_mov_b32 s62, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 21 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s30, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 20 -; SI-NEXT: s_or_b32 s58, s20, s19 -; SI-NEXT: s_mov_b32 s91, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 19 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s35, s11 +; SI-NEXT: v_readlane_b32 s9, v44, 26 +; SI-NEXT: s_or_b32 s52, s20, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 25 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v44, 24 +; SI-NEXT: s_or_b32 s54, s20, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 23 +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v44, 22 +; SI-NEXT: s_or_b32 s87, s17, s20 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 21 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v44, 20 +; SI-NEXT: s_or_b32 s64, s20, s17 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: s_mov_b32 s31, s9 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s15, 24 +; SI-NEXT: v_readlane_b32 s9, v44, 19 +; SI-NEXT: s_or_b32 s55, s20, s17 +; SI-NEXT: s_mov_b32 s72, s9 +; SI-NEXT: s_and_b32 s17, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 18 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: s_or_b32 s65, s20, s17 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_lshl_b32 s20, s13, 8 +; SI-NEXT: s_or_b32 s26, s17, s20 +; SI-NEXT: s_and_b32 s17, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 18 -; SI-NEXT: s_mov_b32 s4, s46 -; SI-NEXT: s_or_b32 s46, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 -; SI-NEXT: s_mov_b32 s52, s73 -; SI-NEXT: s_or_b32 s73, s20, s19 -; SI-NEXT: s_and_b32 s19, s31, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 -; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s42, 24 -; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_or_b32 s67, s20, s17 +; SI-NEXT: s_and_b32 s17, s4, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s29, 24 +; SI-NEXT: s_or_b32 s66, s20, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_or_b32 s42, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 -; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s61, 8 -; SI-NEXT: v_readlane_b32 s93, v44, 17 -; SI-NEXT: s_or_b32 s27, s19, s20 -; SI-NEXT: s_and_b32 s19, s40, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s93, 24 -; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s59, s7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 +; SI-NEXT: s_or_b32 s68, s20, s17 +; SI-NEXT: s_and_b32 s17, s88, 0xff +; SI-NEXT: s_lshl_b32 s20, s27, 8 +; SI-NEXT: s_mov_b32 s34, s23 +; SI-NEXT: s_or_b32 s23, s17, s20 +; SI-NEXT: s_and_b32 s17, s47, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s44, 24 +; SI-NEXT: s_or_b32 s70, s20, s17 +; SI-NEXT: s_and_b32 s17, s78, 0xff +; SI-NEXT: s_mov_b32 s94, s47 +; SI-NEXT: s_mov_b32 s47, s7 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 17 +; SI-NEXT: s_mov_b32 s93, s60 +; SI-NEXT: s_mov_b32 s60, s8 +; SI-NEXT: s_or_b32 s69, s20, s17 +; SI-NEXT: s_mov_b32 s8, s7 +; SI-NEXT: s_and_b32 s17, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v44, 16 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: s_mov_b32 s56, s10 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_mov_b32 s10, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 15 -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 15 +; SI-NEXT: s_or_b32 s28, s20, s17 +; SI-NEXT: s_mov_b32 s40, s7 +; SI-NEXT: s_and_b32 s17, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v44, 14 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: s_mov_b32 s75, s94 -; SI-NEXT: s_or_b32 s94, s20, s19 -; SI-NEXT: s_mov_b32 s41, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 13 -; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 12 -; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 11 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: s_mov_b32 s45, s9 -; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: v_readlane_b32 s10, v44, 13 +; SI-NEXT: s_mov_b32 s61, s9 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 10 -; SI-NEXT: s_mov_b32 s38, s11 -; SI-NEXT: s_or_b32 s11, s20, s19 -; SI-NEXT: s_mov_b32 s72, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 9 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s82, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 8 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s83, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s96, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 6 -; SI-NEXT: s_mov_b32 s90, s31 -; SI-NEXT: s_or_b32 s31, s20, s19 -; SI-NEXT: s_mov_b32 s98, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 5 -; SI-NEXT: s_mov_b32 s44, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 4 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s43, s93 -; SI-NEXT: s_mov_b32 s93, s21 -; SI-NEXT: s_or_b32 s21, s19, s20 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s34, s4 +; SI-NEXT: s_or_b32 s25, s17, s20 +; SI-NEXT: s_mov_b32 s7, s10 +; SI-NEXT: s_and_b32 s17, s10, 0xff +; SI-NEXT: v_readlane_b32 s10, v44, 12 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_lshl_b32 s20, s10, 24 +; SI-NEXT: v_readlane_b32 s10, v44, 11 +; SI-NEXT: s_or_b32 s81, s20, s17 +; SI-NEXT: s_mov_b32 s62, s10 +; SI-NEXT: s_and_b32 s17, s10, 0xff +; SI-NEXT: v_readlane_b32 s10, v44, 10 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_mov_b32 s82, s10 +; SI-NEXT: s_lshl_b32 s20, s10, 24 +; SI-NEXT: v_readlane_b32 s10, v44, 9 +; SI-NEXT: s_or_b32 s80, s20, s17 +; SI-NEXT: s_and_b32 s17, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v44, 8 +; SI-NEXT: s_mov_b32 s96, s10 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_mov_b32 s10, s12 +; SI-NEXT: s_lshl_b32 s20, s12, 24 +; SI-NEXT: v_readlane_b32 s12, v44, 7 +; SI-NEXT: s_or_b32 s41, s20, s17 +; SI-NEXT: s_mov_b32 s98, s12 +; SI-NEXT: s_and_b32 s17, s12, 0xff +; SI-NEXT: v_readlane_b32 s12, v44, 6 +; SI-NEXT: s_mov_b32 s46, s12 +; SI-NEXT: s_lshl_b32 s20, s12, 8 +; SI-NEXT: v_readlane_b32 s12, v44, 5 +; SI-NEXT: v_readlane_b32 s21, v44, 4 +; SI-NEXT: s_mov_b32 s36, s29 +; SI-NEXT: s_or_b32 s29, s17, s20 +; SI-NEXT: s_and_b32 s17, s12, 0xff +; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s20, s21, 24 +; SI-NEXT: v_readlane_b32 s4, v43, 59 +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_mov_b32 s35, s13 +; SI-NEXT: s_mov_b32 s50, s11 +; SI-NEXT: s_mov_b32 s11, s42 +; SI-NEXT: s_mov_b32 s49, s45 +; SI-NEXT: s_mov_b32 s48, s43 +; SI-NEXT: s_mov_b32 s15, s88 +; SI-NEXT: s_mov_b32 s39, s27 +; SI-NEXT: s_mov_b32 s27, s89 +; SI-NEXT: s_mov_b32 s89, s44 +; SI-NEXT: s_mov_b32 s38, s78 +; SI-NEXT: s_mov_b32 s78, s30 +; SI-NEXT: s_mov_b32 s30, s58 +; SI-NEXT: s_mov_b32 s58, s37 +; SI-NEXT: s_mov_b32 s42, s12 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_or_b32 s83, s20, s17 +; SI-NEXT: s_lshl_b32 s21, s4, 16 +; SI-NEXT: s_lshl_b32 s20, s5, 16 +; SI-NEXT: s_lshl_b32 s59, s6, 16 +; SI-NEXT: s_lshl_b32 s17, s16, 16 +; SI-NEXT: s_lshl_b32 s16, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v43, 60 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s63, s95 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s61, s8 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s15, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s95, s5, 16 -; SI-NEXT: s_lshl_b32 s22, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s19, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s18, vcc_lo, 16 ; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 ; SI-NEXT: s_lshl_b32 s99, s84, 16 -; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_lshl_b32 s12, s85, 16 ; SI-NEXT: s_lshl_b32 s97, s86, 16 -; SI-NEXT: s_lshl_b32 s28, s87, 16 +; SI-NEXT: s_lshl_b32 s56, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 ; SI-NEXT: v_readlane_b32 s26, v43, 58 -; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v43, 59 -; SI-NEXT: v_readlane_b32 s66, v43, 63 -; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v43, 62 -; SI-NEXT: v_readlane_b32 s65, v43, 61 -; SI-NEXT: v_readlane_b32 s64, v42, 0 -; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: v_readlane_b32 s21, v42, 1 +; SI-NEXT: v_readlane_b32 s37, v43, 62 +; SI-NEXT: s_lshl_b32 s86, s23, 16 +; SI-NEXT: v_readlane_b32 s23, v42, 0 +; SI-NEXT: s_lshl_b32 s85, s25, 16 +; SI-NEXT: v_readlane_b32 s25, v43, 57 +; SI-NEXT: v_readlane_b32 s88, v43, 60 +; SI-NEXT: s_lshl_b32 s84, s29, 16 +; SI-NEXT: v_readlane_b32 s29, v43, 61 +; SI-NEXT: v_readlane_b32 s73, v42, 1 +; SI-NEXT: v_readlane_b32 s24, v43, 63 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_add_i32 s6, s12, 3 +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_add_i32 s6, s42, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s5, s22, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s72, 3 +; SI-NEXT: s_add_i32 s5, s62, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s6, s82, 8 -; SI-NEXT: s_add_i32 s16, s83, 3 +; SI-NEXT: s_add_i32 s16, s96, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s6, s10, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s41, 3 +; SI-NEXT: s_add_i32 s6, s40, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s14, 8 -; SI-NEXT: s_add_i32 s17, s81, 3 +; SI-NEXT: s_lshl_b32 s16, s9, 8 +; SI-NEXT: s_add_i32 s17, s7, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s16, s9, 24 +; SI-NEXT: s_lshl_b32 s16, s14, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_add_i32 s16, s38, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s39, 8 -; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_lshl_b32 s17, s30, 8 +; SI-NEXT: s_add_i32 s18, s8, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s17, s71, 24 @@ -152081,83 +150258,87 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s50, 3 +; SI-NEXT: s_add_i32 s17, s15, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s49, 8 -; SI-NEXT: s_add_i32 s19, s60, 3 +; SI-NEXT: s_lshl_b32 s18, s39, 8 +; SI-NEXT: s_add_i32 s19, s94, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s43, 24 +; SI-NEXT: s_lshl_b32 s18, s89, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s34, 3 +; SI-NEXT: s_add_i32 s18, s95, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s37, 8 -; SI-NEXT: s_add_i32 s20, s48, 3 +; SI-NEXT: s_lshl_b32 s19, s36, 8 +; SI-NEXT: s_add_i32 s20, s49, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s51, 24 +; SI-NEXT: s_lshl_b32 s19, s48, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s90, 3 +; SI-NEXT: s_add_i32 s19, s93, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s55, 8 -; SI-NEXT: s_add_i32 s22, s54, 3 +; SI-NEXT: s_lshl_b32 s20, s35, 8 +; SI-NEXT: s_add_i32 s21, s34, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s53, 24 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s20, s50, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s91, 3 +; SI-NEXT: s_add_i32 s20, s31, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s35, 8 -; SI-NEXT: s_add_i32 s23, s38, 3 -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: s_lshl_b32 s22, s52, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_lshl_b32 s21, s57, 8 +; SI-NEXT: s_add_i32 s22, s72, 3 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s21, s61, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s92, 3 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s36, 8 -; SI-NEXT: s_add_i32 s60, s62, 3 -; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_readlane_b32 s7, v44, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s21, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v44, 23 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s22, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 21 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_lshl_b32 s22, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 22 +; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff -; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_or_b32 s23, s23, s60 -; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_or_b32 s22, s22, s60 +; SI-NEXT: s_and_b32 s21, s21, 0xffff ; SI-NEXT: v_readlane_b32 s7, v44, 28 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_add_i32 s23, s7, 3 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_add_i32 s22, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v44, 27 -; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v44, 25 -; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_or_b32 s22, s60, s22 ; SI-NEXT: s_lshl_b32 s60, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 -; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 -; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: v_readlane_b32 s7, v44, 32 -; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_or_b32 s22, s60, s22 ; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_and_b32 s60, s60, 0xff @@ -152168,210 +150349,194 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_add_i32 s62, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 48 -; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 47 -; SI-NEXT: s_lshl_b32 s62, s62, 16 -; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v44, 33 -; SI-NEXT: s_or_b32 s61, s61, s62 -; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 44 -; SI-NEXT: s_or_b32 s60, s61, s60 -; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s57, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 45 -; SI-NEXT: s_lshl_b32 s56, s7, 8 +; SI-NEXT: s_add_i32 s56, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 44 -; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 43 -; SI-NEXT: s_add_i32 s46, s7, 3 +; SI-NEXT: s_lshl_b32 s47, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v43, 42 -; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 39 -; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 36 -; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 35 -; SI-NEXT: s_and_b32 s45, s45, 0xff -; SI-NEXT: s_add_i32 s14, s7, 3 -; SI-NEXT: s_or_b32 s42, s42, s45 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_lshl_b32 s45, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 41 -; SI-NEXT: s_and_b32 s57, s57, 0xff -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s42, 0xffff +; SI-NEXT: s_and_b32 s56, s56, 0xff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 40 -; SI-NEXT: s_or_b32 s56, s56, s57 -; SI-NEXT: s_or_b32 s57, s14, s15 -; SI-NEXT: s_and_b32 s14, s44, 0xff -; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s44, s44, 0xff +; SI-NEXT: s_add_i32 s46, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 38 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_addk_i32 s47, 0x300 +; SI-NEXT: s_lshl_b32 s41, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 35 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_and_b32 s45, s47, 0xffff +; SI-NEXT: s_lshl_b32 s14, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 34 +; SI-NEXT: s_or_b32 s47, s44, s45 +; SI-NEXT: s_and_b32 s44, s46, 0xff +; SI-NEXT: s_add_i32 s13, s7, 3 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_addk_i32 s41, 0x300 +; SI-NEXT: v_readlane_b32 s7, v43, 40 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s41, 0xffff +; SI-NEXT: s_add_i32 s43, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: s_or_b32 s56, s13, s14 +; SI-NEXT: s_and_b32 s13, s43, 0xff +; SI-NEXT: s_lshl_b32 s14, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 36 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_lshl_b32 s14, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 37 ; SI-NEXT: s_add_i32 s40, s7, 3 -; SI-NEXT: s_and_b32 s61, s61, 0xff -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_and_b32 s57, s57, 0xff +; SI-NEXT: s_and_b32 s15, s40, 0xff +; SI-NEXT: s_lshl_b32 s57, s57, 16 ; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_or_b32 s59, s59, s61 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_or_b32 s57, s59, s57 ; SI-NEXT: s_and_b32 s58, s58, 0xffff -; SI-NEXT: s_or_b32 s15, s15, s40 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s58, s59, s58 -; SI-NEXT: s_or_b32 s59, s15, s14 -; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v43, 32 -; SI-NEXT: s_add_i32 s11, s6, 3 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_or_b32 s58, s14, s13 +; SI-NEXT: s_add_i32 s13, s6, 0x3000000 +; SI-NEXT: v_readlane_b32 s6, v43, 30 +; SI-NEXT: s_add_i32 s9, s6, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 29 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 -; SI-NEXT: s_add_i32 s24, s7, 3 -; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: s_add_i32 s27, s7, 3 +; SI-NEXT: s_and_b32 s9, s27, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_or_b32 s8, s8, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 33 ; SI-NEXT: s_add_i32 s12, s6, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s7, 8 +; SI-NEXT: s_lshl_b32 s9, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v43, 17 -; SI-NEXT: s_or_b32 s6, s11, s6 -; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 19 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_lshl_b32 s9, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 18 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 34 +; SI-NEXT: v_readlane_b32 s7, v43, 32 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: s_add_i32 s13, s7, 3 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: s_add_i32 s11, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 31 -; SI-NEXT: s_or_b32 s6, s11, s6 -; SI-NEXT: s_and_b32 s11, s13, 0xff +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_and_b32 s9, s11, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_lshl_b32 s10, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: s_add_i32 s24, s7, 3 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 24 -; SI-NEXT: s_add_i32 s25, s7, 3 -; SI-NEXT: s_and_b32 s12, s25, 0xff -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v43, 30 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s9, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 28 -; SI-NEXT: v_readlane_b32 s11, v43, 21 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 26 +; SI-NEXT: v_readlane_b32 s11, v43, 20 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: v_readlane_b32 s10, v43, 21 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s9, s9, s11 -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v43, 20 -; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 18 -; SI-NEXT: v_readlane_b32 s12, v43, 15 -; SI-NEXT: s_and_b32 s9, s21, 0xff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: v_readlane_b32 s10, v43, 19 +; SI-NEXT: s_add_i32 s23, s10, 3 +; SI-NEXT: v_readlane_b32 s11, v43, 16 +; SI-NEXT: v_readlane_b32 s12, v43, 14 +; SI-NEXT: s_and_b32 s10, s23, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 15 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 14 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 13 +; SI-NEXT: s_add_i32 s14, s16, 0x3000000 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v43, 13 -; SI-NEXT: v_readlane_b32 s13, v43, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 12 +; SI-NEXT: v_readlane_b32 s16, v43, 10 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 12 -; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: v_readlane_b32 s12, v43, 11 +; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s12, s12, s16 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 10 -; SI-NEXT: s_add_i32 s15, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s12, v43, 9 +; SI-NEXT: s_add_i32 s15, s17, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 9 -; SI-NEXT: v_readlane_b32 s16, v43, 7 +; SI-NEXT: v_readlane_b32 s16, v43, 8 +; SI-NEXT: v_readlane_b32 s17, v43, 6 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_lshl_b32 s13, s13, 24 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_or_b32 s13, s13, s16 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 6 -; SI-NEXT: s_add_i32 s40, s17, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 5 -; SI-NEXT: v_readlane_b32 s17, v43, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 4 +; SI-NEXT: s_or_b32 s12, s16, s12 +; SI-NEXT: v_readlane_b32 s16, v43, 7 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 2 -; SI-NEXT: s_add_i32 s41, s18, 0x3000000 +; SI-NEXT: s_or_b32 s12, s16, s12 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: s_add_i32 s40, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 1 -; SI-NEXT: v_readlane_b32 s18, v44, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 4 +; SI-NEXT: v_readlane_b32 s18, v43, 2 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -152379,50 +150544,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v44, 62 -; SI-NEXT: s_add_i32 s42, s19, 0x3000000 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v44, 61 -; SI-NEXT: v_readlane_b32 s19, v44, 59 -; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 1 +; SI-NEXT: s_add_i32 s41, s19, 0x3000000 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v43, 0 +; SI-NEXT: v_readlane_b32 s19, v44, 62 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 60 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v44, 63 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 58 -; SI-NEXT: s_add_i32 s43, s20, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v44, 57 -; SI-NEXT: v_readlane_b32 s20, v44, 55 -; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s18, s17, 0x3000000 +; SI-NEXT: v_readlane_b32 s17, v44, 61 +; SI-NEXT: s_add_i32 s42, s20, 0x3000000 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s19, v44, 60 +; SI-NEXT: v_readlane_b32 s20, v44, 58 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 56 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: v_readlane_b32 s19, v44, 59 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 54 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: v_readlane_b32 s19, v44, 57 +; SI-NEXT: s_add_i32 s43, s21, 0x3000000 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v44, 53 -; SI-NEXT: v_readlane_b32 s21, v44, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 56 +; SI-NEXT: v_readlane_b32 s21, v44, 54 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 52 +; SI-NEXT: v_readlane_b32 s20, v44, 55 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -152430,16 +150596,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 53 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v44, 49 -; SI-NEXT: v_readlane_b32 s22, v44, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 52 +; SI-NEXT: v_readlane_b32 s22, v44, 50 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v44, 48 +; SI-NEXT: v_readlane_b32 s21, v44, 51 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -152447,406 +150613,325 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 43 -; SI-NEXT: s_add_i32 s45, s23, 0x3000000 +; SI-NEXT: s_add_i32 s22, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v44, 49 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v44, 42 -; SI-NEXT: v_readlane_b32 s23, v44, 45 +; SI-NEXT: v_readlane_b32 s21, v44, 48 +; SI-NEXT: v_readlane_b32 s23, v44, 46 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s22, 8 +; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v44, 46 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readlane_b32 s21, v44, 47 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_lshl_b32 s22, s22, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 41 +; SI-NEXT: s_or_b32 s21, s21, s23 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s23, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v44, 43 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v44, 40 -; SI-NEXT: v_readlane_b32 s24, v44, 38 +; SI-NEXT: v_readlane_b32 s21, v44, 42 +; SI-NEXT: v_readlane_b32 s24, v44, 44 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v44, 39 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readlane_b32 s21, v44, 45 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_lshl_b32 s23, s23, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 24 ; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s23, s23, s24 -; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 37 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v44, 36 -; SI-NEXT: v_readlane_b32 s25, v44, 34 -; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_or_b32 s21, s21, s24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readlane_b32 s21, v44, 41 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s24, v44, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 39 +; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 35 +; SI-NEXT: s_or_b32 s21, s24, s21 +; SI-NEXT: v_readlane_b32 s24, v44, 40 ; SI-NEXT: s_and_b32 s25, s25, 0xff -; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s21, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_and_b32 s21, s21, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v44, 2 +; SI-NEXT: s_or_b32 s21, s24, s21 +; SI-NEXT: s_add_i32 s24, s21, 0x3000000 +; SI-NEXT: v_readlane_b32 s21, v44, 38 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s25, v44, 37 ; SI-NEXT: v_readlane_b32 s26, v44, 1 -; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v44, 0 +; SI-NEXT: s_or_b32 s21, s25, s21 +; SI-NEXT: v_readlane_b32 s25, v44, 36 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_addk_i32 s21, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 -; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_and_b32 s21, s21, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s17, 16 -; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 49 -; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 -; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_or_b32 s21, s25, s21 +; SI-NEXT: v_readlane_b32 s25, v44, 35 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_readlane_b32 s26, v44, 34 +; SI-NEXT: v_readlane_b32 s27, v44, 3 +; SI-NEXT: s_and_b32 s76, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s90, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s11, 16 +; SI-NEXT: s_and_b32 s92, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s10, 16 +; SI-NEXT: s_and_b32 s10, s7, 0xffff0000 +; SI-NEXT: s_add_i32 s46, s57, 0x3000000 +; SI-NEXT: s_add_i32 s57, s58, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s11, 50 -; SI-NEXT: s_lshl_b32 s61, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_and_b32 s25, s25, 0xff +; SI-NEXT: s_lshl_b32 s26, s26, 8 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_and_b32 s58, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s10, 49 +; SI-NEXT: s_lshl_b32 s18, s7, 16 +; SI-NEXT: s_and_b32 s7, s9, 0xffff0000 +; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: v_readlane_b32 s26, v44, 2 +; SI-NEXT: s_and_b32 s27, s27, 0xff +; SI-NEXT: v_writelane_b32 v43, s7, 50 ; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: v_writelane_b32 v43, s7, 52 +; SI-NEXT: s_lshl_b32 s62, s62, 16 +; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: s_addk_i32 s25, 0x300 +; SI-NEXT: s_lshl_b32 s26, s26, 24 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 51 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s60, s60, 0xffff +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: v_writelane_b32 v43, s7, 52 ; SI-NEXT: s_lshl_b32 s7, s8, 16 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 54 -; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_add_i32 s56, s56, 0x3000000 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s46, s60, 0x3000000 -; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s45, s60, 0x3000000 +; SI-NEXT: s_add_i32 s47, s47, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_add_i32 s17, s17, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 -; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 55 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 -; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s22, 16 -; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 +; SI-NEXT: s_add_i32 s21, s21, 0x3000000 +; SI-NEXT: s_add_i32 s25, s25, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_and_b32 s88, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 +; SI-NEXT: s_and_b32 s37, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s24, 16 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s74, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s22, 16 +; SI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s12, 16 -; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s10, 16 +; SI-NEXT: s_and_b32 s27, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s12, 16 +; SI-NEXT: s_lshl_b32 s75, s9, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 56 -; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s47, 16 -; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s43, 16 -; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s87, s42, 16 -; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s41, 16 -; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s69, s15, 16 -; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s57, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_lshl_b32 s7, s56, 16 +; SI-NEXT: s_and_b32 s60, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s47, 16 +; SI-NEXT: s_and_b32 s51, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s46, 16 +; SI-NEXT: s_and_b32 s53, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s45, 16 +; SI-NEXT: s_and_b32 s54, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s44, 16 +; SI-NEXT: s_and_b32 s64, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s43, 16 +; SI-NEXT: s_and_b32 s65, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s42, 16 +; SI-NEXT: s_and_b32 s67, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s41, 16 +; SI-NEXT: s_and_b32 s68, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s40, 16 +; SI-NEXT: s_and_b32 s70, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s15, 16 +; SI-NEXT: s_and_b32 s28, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s14, 16 +; SI-NEXT: s_and_b32 s81, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s13, 16 +; SI-NEXT: s_and_b32 s41, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s80, s5, 16 -; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 57 +; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v43, 49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s59 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s63 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s77 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s78 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s91 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s11 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_readlane_b32 s4, v43, 49 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 ; SI-NEXT: v_readlane_b32 s4, v43, 50 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s75 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 ; SI-NEXT: v_readlane_b32 s4, v43, 51 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: v_readlane_b32 s4, v43, 52 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s4 ; SI-NEXT: v_readlane_b32 s4, v43, 53 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 ; SI-NEXT: v_readlane_b32 s4, v43, 54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 ; SI-NEXT: v_readlane_b32 s4, v43, 55 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s4 ; SI-NEXT: v_readlane_b32 s4, v43, 56 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 57 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s60 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s47 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s86 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s81 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s80 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s83 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: v_readlane_b32 s99, v41, 35 ; SI-NEXT: v_readlane_b32 s98, v41, 34 ; SI-NEXT: v_readlane_b32 s97, v41, 33 @@ -152884,123 +150969,118 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readlane_b32 s31, v41, 1 ; SI-NEXT: v_readlane_b32 s30, v41, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; kill: killed $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s92, v44, 24 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: v_readlane_b32 s91, v44, 20 -; SI-NEXT: s_mov_b32 s90, s88 -; SI-NEXT: v_readlane_b32 s36, v44, 23 -; SI-NEXT: v_readlane_b32 s35, v44, 19 -; SI-NEXT: v_readlane_b32 s62, v44, 22 -; SI-NEXT: v_readlane_b32 s38, v44, 18 -; SI-NEXT: s_mov_b32 s34, s46 -; SI-NEXT: s_mov_b32 s93, s21 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: v_readlane_b32 s72, v44, 10 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: v_readlane_b32 s30, v44, 21 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s52, s79 -; SI-NEXT: v_readlane_b32 s98, v44, 6 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: v_readlane_b32 s43, v44, 17 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: v_readlane_b32 s41, v44, 14 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: v_readlane_b32 s14, v44, 13 -; SI-NEXT: v_readlane_b32 s44, v44, 5 -; SI-NEXT: v_readlane_b32 s9, v44, 11 -; SI-NEXT: v_readlane_b32 s81, v44, 12 -; SI-NEXT: v_readlane_b32 s82, v44, 9 -; SI-NEXT: v_readlane_b32 s10, v44, 16 -; SI-NEXT: v_readlane_b32 s12, v44, 4 -; SI-NEXT: v_readlane_b32 s96, v44, 7 -; SI-NEXT: v_readlane_b32 s83, v44, 8 -; SI-NEXT: v_readlane_b32 s71, v44, 15 +; SI-NEXT: v_readlane_b32 s31, v44, 20 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; kill: killed $sgpr12 +; SI-NEXT: s_mov_b32 s93, s60 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; kill: killed $sgpr12 +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; kill: killed $sgpr12 +; SI-NEXT: s_mov_b32 s15, s88 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: v_readlane_b32 s72, v44, 19 +; SI-NEXT: s_mov_b32 s38, s78 +; SI-NEXT: v_readlane_b32 s61, v44, 18 +; SI-NEXT: s_mov_b32 s30, s58 +; SI-NEXT: s_mov_b32 s95, s63 +; SI-NEXT: v_readlane_b32 s62, v44, 11 +; SI-NEXT: s_mov_b32 s39, s27 +; SI-NEXT: s_mov_b32 s36, s29 +; SI-NEXT: v_readlane_b32 s82, v44, 10 +; SI-NEXT: v_readlane_b32 s98, v44, 7 +; SI-NEXT: s_mov_b32 s89, s44 +; SI-NEXT: s_mov_b32 s94, s47 +; SI-NEXT: s_mov_b32 s48, s43 +; SI-NEXT: s_mov_b32 s49, s45 +; SI-NEXT: v_readlane_b32 s46, v44, 6 +; SI-NEXT: v_readlane_b32 s40, v44, 15 +; SI-NEXT: s_mov_b32 s35, s13 +; SI-NEXT: v_readlane_b32 s9, v44, 14 +; SI-NEXT: v_readlane_b32 s42, v44, 5 +; SI-NEXT: s_mov_b32 s50, s11 +; SI-NEXT: s_mov_b32 s34, s23 +; SI-NEXT: v_readlane_b32 s14, v44, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: v_readlane_b32 s10, v44, 8 +; SI-NEXT: v_readlane_b32 s96, v44, 9 +; SI-NEXT: v_readlane_b32 s71, v44, 16 +; SI-NEXT: v_readlane_b32 s8, v44, 17 +; SI-NEXT: v_readlane_b32 s22, v44, 4 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; kill: killed $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; kill: killed $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -156683,332 +154763,98 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -157038,6 +154884,98 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 @@ -157088,390 +155026,568 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v58 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v59 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_alignbit_b32 v38, v0, v1, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v35, v0, v1, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v0, v1, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v29, v0, v1, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v0, v1, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v54, v0, v1, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v0, v1, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v15, v0, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v50, v0, v1, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v0, v53, 16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v48, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_alignbit_b32 v11, v0, v19, 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_alignbit_b32 v37, v0, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_alignbit_b32 v8, v0, v21, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v59 +; SI-NEXT: v_mov_b32_e32 v59, v19 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_alignbit_b32 v34, v0, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_alignbit_b32 v5, v0, v41, 16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_alignbit_b32 v4, v1, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_alignbit_b32 v3, v1, v61, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v56 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_alignbit_b32 v2, v1, v47, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_alignbit_b32 v20, v7, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v60, 16 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_alignbit_b32 v17, v7, v10, 16 +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v7, v24, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v10, v19, v30, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v7, v63, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -157515,223 +155631,222 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v58 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v39 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v27 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v49 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v53 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v55 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v41 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v43 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v39, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v58, v7, v1, 8 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB90_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v45 +; SI-NEXT: v_alignbit_b32 v2, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v57 +; SI-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v4, v4, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v7, v7, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v59, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v11, v13, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_alignbit_b32 v14, v13, v0, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v13, v15, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v36 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v58, v7, v1, 8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v17, v15, v0, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v15, v18, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_alignbit_b32 v32, v34, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -157752,391 +155867,365 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v20, v18, v0, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v19, v0, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v23, v23, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v19, v0, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v27 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v21 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v26, v26, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_alignbit_b32 v28, v19, v0, 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v30, v0, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v34, v34, v33, 16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v33 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v39, v14, v3, 24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 ; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16 ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v0, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: .LBB90_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158145,15 +156234,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158162,16 +156251,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158180,16 +156269,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158198,16 +156287,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158216,16 +156305,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158234,16 +156323,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v29 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158252,16 +156341,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158270,16 +156359,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158288,16 +156377,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158306,16 +156395,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158324,16 +156413,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158342,16 +156431,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158360,16 +156449,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158378,52 +156467,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v15 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v13 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -158432,283 +156485,323 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v31 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v11 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v31 +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v31 +; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v31 +; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v31 +; SI-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v31 +; SI-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v31 +; SI-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v31 +; SI-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v31 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v31 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v31 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v31 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -164235,47 +162328,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -164308,323 +162365,365 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v19 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v23 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v20 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v18 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s7 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v16 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: s_lshr_b32 s7, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v25 ; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: v_readfirstlane_b32 s4, v44 ; SI-NEXT: s_lshr_b32 s65, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_readfirstlane_b32 s4, v27 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: v_readfirstlane_b32 s4, v29 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_lshr_b32 s69, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v44 -; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: s_lshr_b32 s91, s4, 16 -; SI-NEXT: v_mov_b32_e32 v30, v51 -; SI-NEXT: v_readfirstlane_b32 s4, v47 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v57 ; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v20 ; SI-NEXT: s_lshr_b32 s37, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_mov_b32_e32 v41, v5 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: s_lshr_b32 s89, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v43 ; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_readfirstlane_b32 s88, v35 ; SI-NEXT: s_lshr_b32 s57, s4, 16 -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_mov_b32_e32 v35, v48 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_mov_b32_e32 v39, v53 ; SI-NEXT: s_lshr_b32 s79, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_mov_b32_e32 v9, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v42 ; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: v_mov_b32_e32 v20, v21 -; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: v_readfirstlane_b32 s4, v41 ; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: v_readfirstlane_b32 s4, v47 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_readfirstlane_b32 s4, v56 +; SI-NEXT: v_mov_b32_e32 v12, v44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v23 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_mov_b32_e32 v43, v36 ; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v28 -; SI-NEXT: v_mov_b32_e32 v21, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v43 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v27 -; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_mov_b32_e32 v36, v45 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s28, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s45, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v17 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: v_readfirstlane_b32 s4, v58 ; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v40 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshr_b32 s23, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v22 ; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 ; SI-NEXT: s_mov_b32 s9, s96 -; SI-NEXT: v_readfirstlane_b32 s88, v60 ; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 -; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: v_readfirstlane_b32 s64, v21 ; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 ; SI-NEXT: s_mov_b32 s87, s84 -; SI-NEXT: v_readfirstlane_b32 s68, v48 +; SI-NEXT: v_readfirstlane_b32 s68, v26 ; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 ; SI-NEXT: s_mov_b32 s81, s70 -; SI-NEXT: v_readfirstlane_b32 s90, v30 +; SI-NEXT: v_readfirstlane_b32 s90, v31 ; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 ; SI-NEXT: s_mov_b32 s67, s38 -; SI-NEXT: v_readfirstlane_b32 s36, v3 +; SI-NEXT: v_readfirstlane_b32 s36, v34 ; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 ; SI-NEXT: s_mov_b32 s53, s98 ; SI-NEXT: s_mov_b32 s31, s82 -; SI-NEXT: v_readfirstlane_b32 s56, v7 +; SI-NEXT: v_readfirstlane_b32 s56, v54 ; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 ; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: v_readfirstlane_b32 s78, v60 ; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16 ; SI-NEXT: s_mov_b32 s93, s74 -; SI-NEXT: v_readfirstlane_b32 s72, v19 +; SI-NEXT: v_readfirstlane_b32 s72, v53 ; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16 ; SI-NEXT: s_mov_b32 s77, s60 -; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: v_readfirstlane_b32 s58, v46 ; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 ; SI-NEXT: s_mov_b32 s63, s54 -; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: v_readfirstlane_b32 s44, v59 ; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16 ; SI-NEXT: s_mov_b32 s47, s42 -; SI-NEXT: v_mov_b32_e32 v26, v37 -; SI-NEXT: v_readfirstlane_b32 s28, v26 ; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16 ; SI-NEXT: s_mov_b32 s41, s26 -; SI-NEXT: v_readfirstlane_b32 s22, v36 -; SI-NEXT: v_readfirstlane_b32 s18, v49 -; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 -; SI-NEXT: v_mov_b32_e32 v1, v56 -; SI-NEXT: v_mov_b32_e32 v3, v54 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 -; SI-NEXT: s_lshr_b32 s78, s96, 8 -; SI-NEXT: s_lshr_b32 s61, s84, 8 -; SI-NEXT: s_lshr_b32 s72, s70, 8 -; SI-NEXT: s_lshr_b32 s75, s38, 8 -; SI-NEXT: s_lshr_b32 s58, s98, 8 -; SI-NEXT: s_lshr_b32 s43, s82, 8 -; SI-NEXT: s_lshr_b32 s44, s94, 8 -; SI-NEXT: s_mov_b32 s64, s74 -; SI-NEXT: s_lshr_b32 s27, s74, 8 -; SI-NEXT: s_mov_b32 s90, s60 -; SI-NEXT: s_lshr_b32 s28, s60, 8 -; SI-NEXT: s_lshr_b32 s74, s54, 8 -; SI-NEXT: s_mov_b32 s68, s42 -; SI-NEXT: s_mov_b32 s56, s26 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: s_lshr_b32 s23, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v58 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v59 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s22, v11 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: s_lshr_b32 s19, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_readfirstlane_b32 s4, v30 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_readfirstlane_b32 s4, v15 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v3 ; SI-NEXT: s_lshr_b32 s13, s4, 16 ; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: v_writelane_b32 v61, s5, 27 -; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_readfirstlane_b32 s18, v13 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_mov_b32 s17, s14 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: v_mov_b32_e32 v7, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s5, v56 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v10 ; SI-NEXT: s_lshr_b32 s13, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v14 ; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 ; SI-NEXT: s_mov_b32 s5, vcc_lo ; SI-NEXT: s_mov_b32 s88, vcc_lo @@ -164733,10 +162832,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7 ; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5 ; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 -; SI-NEXT: s_mov_b32 s25, s34 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3 ; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24 @@ -164744,10 +162841,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13 ; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10 -; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11 ; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8 -; SI-NEXT: s_mov_b32 s17, s14 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9 ; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24 @@ -164757,7 +162852,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17 ; SI-NEXT: s_lshr_b64 vcc, s[16:17], 8 -; SI-NEXT: s_mov_b32 s11, s20 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15 ; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 @@ -164777,9 +162871,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 ; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v10 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s64, s74 +; SI-NEXT: s_lshr_b32 s27, s74, 8 +; SI-NEXT: s_mov_b32 s90, s60 +; SI-NEXT: s_lshr_b32 s28, s60, 8 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_mov_b32 s68, s42 ; SI-NEXT: s_lshr_b32 s22, s42, 8 +; SI-NEXT: s_mov_b32 s56, s26 ; SI-NEXT: s_lshr_b32 s21, s26, 8 ; SI-NEXT: s_lshr_b32 s18, s34, 8 ; SI-NEXT: s_mov_b32 s36, s14 @@ -164788,182 +162891,142 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b32 s12, s20, 8 ; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57 -; SI-NEXT: v_mov_b32_e32 v59, v30 -; SI-NEXT: v_mov_b32_e32 v31, v51 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v39 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v21, v20 -; SI-NEXT: v_mov_b32_e32 v34, v18 -; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v58 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v7, v26 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v37, v17 -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v17, v9 -; SI-NEXT: v_mov_b32_e32 v9, v10 -; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v14, v48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v19, v53 +; SI-NEXT: v_mov_b32_e32 v53, v39 +; SI-NEXT: v_mov_b32_e32 v48, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v39, v43 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v46 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v46, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_mov_b32_e32 v8, v10 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v7, v37 +; SI-NEXT: v_mov_b32_e32 v55, v54 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v49 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v35 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v14, v48 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v48, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_mov_b32_e32 v36, v45 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v12 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v29 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v6, v55 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v8 -; SI-NEXT: v_writelane_b32 v62, s4, 46 -; SI-NEXT: v_writelane_b32 v62, s5, 47 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v29, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 48 -; SI-NEXT: v_writelane_b32 v62, s5, 49 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr57 @@ -164991,11 +163054,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_writelane_b32 v62, s5, 47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_writelane_b32 v62, s5, 49 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 50 ; SI-NEXT: v_writelane_b32 v62, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -165068,364 +163162,373 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, s89, 33 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_mov_b32_e32 v27, v29 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v10 ; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 ; SI-NEXT: s_mov_b32 s7, s9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_writelane_b32 v61, s6, 26 -; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v7 ; SI-NEXT: v_writelane_b32 v61, s7, 27 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: s_lshr_b32 s19, s6, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b32 s19, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s22, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 -; SI-NEXT: s_mov_b32 s17, s26 -; SI-NEXT: s_mov_b32 s11, s20 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_readfirstlane_b32 s22, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 ; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: s_lshr_b32 s23, s6, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_mov_b32 s17, s26 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_readfirstlane_b32 s28, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s28, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_lshr_b32 s29, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_readfirstlane_b32 s44, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s42 +; SI-NEXT: s_lshr_b32 s21, s42, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_readfirstlane_b32 s44, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 ; SI-NEXT: s_lshr_b32 s45, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s58, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_readfirstlane_b32 s58, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: s_lshr_b32 s59, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s72, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v14 ; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 -; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v41 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: s_lshr_b32 s73, s6, 16 ; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v11 ; SI-NEXT: s_lshr_b32 s79, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s74 ; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 ; SI-NEXT: s_mov_b32 s63, s54 ; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 ; SI-NEXT: s_mov_b32 s47, s60 -; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 -; SI-NEXT: s_mov_b32 s41, s42 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 -; SI-NEXT: s_mov_b32 s25, s34 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_lshr_b32 s13, s5, 16 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 -; SI-NEXT: s_mov_b32 s5, vcc_lo -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8 -; SI-NEXT: s_lshr_b32 s22, s60, 8 -; SI-NEXT: s_lshr_b32 s21, s42, 8 -; SI-NEXT: s_lshr_b32 s18, s34, 8 -; SI-NEXT: s_lshr_b32 s12, s20, 8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18 -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s72, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 -; SI-NEXT: s_mov_b32 s77, s74 ; SI-NEXT: s_lshr_b32 s28, s74, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s78, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 -; SI-NEXT: s_mov_b32 s93, s48 -; SI-NEXT: s_lshr_b32 s27, s48, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s22, s60, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s78, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s56, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 ; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s6, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: s_lshr_b32 s89, s6, 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 -; SI-NEXT: s_mov_b32 s51, s94 -; SI-NEXT: s_lshr_b32 s44, s94, 8 -; SI-NEXT: s_mov_b32 s56, s42 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s88, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 ; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s48 ; SI-NEXT: s_lshr_b32 s43, s82, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_lshr_b32 s27, s48, 8 +; SI-NEXT: s_mov_b32 s56, s42 +; SI-NEXT: s_mov_b32 s88, vcc_lo +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_lshr_b32 s37, s6, 16 -; SI-NEXT: s_mov_b32 s88, vcc_lo ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_lshr_b32 s37, s6, 16 ; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 ; SI-NEXT: s_mov_b32 s53, s98 ; SI-NEXT: s_lshr_b32 s58, s98, 8 ; SI-NEXT: s_mov_b32 s36, s26 +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s90, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v16 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s90, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: s_lshr_b32 s91, s6, 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 ; SI-NEXT: s_mov_b32 s67, s38 ; SI-NEXT: s_lshr_b32 s75, s38, 8 ; SI-NEXT: s_mov_b32 s90, s74 ; SI-NEXT: s_lshr_b32 s74, s54, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s68, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s68, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: s_lshr_b32 s69, s6, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 ; SI-NEXT: s_mov_b32 s81, s70 ; SI-NEXT: s_lshr_b32 s72, s70, 8 ; SI-NEXT: s_mov_b32 s68, s60 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s64, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: s_lshr_b64 s[86:87], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: s_lshr_b32 s65, s6, 16 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 ; SI-NEXT: s_mov_b32 s87, s84 -; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; SI-NEXT: s_lshr_b32 s61, s84, 8 ; SI-NEXT: s_mov_b32 s64, s48 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v18 ; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v4 ; SI-NEXT: s_lshr_b32 s7, s6, 16 ; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v16 ; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 ; SI-NEXT: s_mov_b32 s9, s96 ; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24 @@ -165571,17 +163674,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v61, s15, 31 ; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 ; SI-NEXT: v_writelane_b32 v61, s14, 28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5 ; SI-NEXT: v_writelane_b32 v61, s15, 29 ; SI-NEXT: s_lshr_b32 s78, s96, 8 ; SI-NEXT: s_lshr_b32 s15, s26, 8 ; SI-NEXT: s_mov_b32 s14, s20 ; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v14 ; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v62, 0 @@ -165597,7 +163695,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_and_b32 s5, s96, 0xff ; SI-NEXT: s_lshl_b32 s8, s78, 8 @@ -165606,7 +163704,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v25 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 6 @@ -165615,7 +163713,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s5, s86, 0xff ; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 8 @@ -165638,7 +163736,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s65, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v23 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 12 @@ -165668,7 +163766,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s69, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 18 @@ -165699,7 +163797,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s91, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 24 @@ -165730,7 +163828,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s37, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v12 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 30 @@ -165761,7 +163859,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s89, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 36 @@ -165791,7 +163889,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s57, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v60 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 42 @@ -165822,7 +163920,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s79, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v57 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 48 @@ -165852,7 +163950,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s73, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v40 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 54 @@ -165883,7 +163981,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s59, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 60 @@ -165914,7 +164012,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s45, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v32 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 2 @@ -165945,7 +164043,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s29, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v31 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 8 @@ -165975,7 +164073,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s23, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v29 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 14 @@ -165996,63 +164094,17 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 ; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s5, s36, 0xff ; SI-NEXT: s_lshl_b32 s8, s15, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_and_b32 s8, s19, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s61, v62, 35 -; SI-NEXT: v_readlane_b32 s43, v62, 47 -; SI-NEXT: v_readlane_b32 s27, v61, 7 -; SI-NEXT: v_readlane_b32 s21, v61, 13 -; SI-NEXT: v_readlane_b32 s17, v61, 19 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 20 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -166068,32 +164120,30 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s5, s14, 0xff ; SI-NEXT: s_lshl_b32 s8, s12, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 26 ; SI-NEXT: v_readlane_b32 s9, v61, 27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 ; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s11, v61, 25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 28 +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s8, 0xff @@ -166102,13 +164152,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s88, 0xff ; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -166116,29 +164168,71 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s61, v62, 35 +; SI-NEXT: v_readlane_b32 s43, v62, 47 +; SI-NEXT: v_readlane_b32 s27, v61, 7 +; SI-NEXT: v_readlane_b32 s21, v61, 13 +; SI-NEXT: v_readlane_b32 s17, v61, 19 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: v_readlane_b32 s9, v61, 33 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -171364,2246 +169458,2033 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v128i8_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v42, v29 -; SI-NEXT: v_mov_b32_e32 v43, v27 -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v46, v21 -; SI-NEXT: v_mov_b32_e32 v47, v19 -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v59, v11 -; SI-NEXT: v_mov_b32_e32 v55, v9 -; SI-NEXT: v_mov_b32_e32 v54, v7 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:392 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:184 +; SI-NEXT: v_mov_b32_e32 v44, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_mov_b32_e32 v60, v26 +; SI-NEXT: v_mov_b32_e32 v45, v20 +; SI-NEXT: v_mov_b32_e32 v56, v14 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v49 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v53 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v17, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v21, v0, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v15, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v63, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v14, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v13, v0, v59 +; SI-NEXT: v_mov_b32_e32 v0, v61 +; SI-NEXT: v_mov_b32_e32 v61, v3 +; SI-NEXT: v_mov_b32_e32 v3, v23 +; SI-NEXT: v_or_b32_e32 v23, v18, v62 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 -; SI-NEXT: v_mov_b32_e32 v3, v9 -; SI-NEXT: v_or_b32_e32 v9, v2, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v11, v2, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v13, v2, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v15, v2, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v17, v2, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 -; SI-NEXT: v_or_b32_e32 v19, v2, v61 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v21, v2, v62 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v52 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v39, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v49 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v51 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v23, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: v_or_b32_e32 v22, v18, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v24, v18, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v26, v18, v29 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: v_or_b32_e32 v25, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: v_or_b32_e32 v28, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_or_b32_e32 v32, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v32, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v7, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v10, v10, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v12, v12, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v10, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v36, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v37, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v18, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v4, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v22, v22, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v24, v24, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v26, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v18, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v18, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v28, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v30, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v34, v34, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v33, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v20, v20, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v35, v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v36, v36, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v34, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v37, v37, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v39, v39, v1 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v48, v48, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v35, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v9, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v51, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v51, v51, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v49, v49, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v52, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v52, v52, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v50, v50, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v53, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v53, v53, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v54, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v54, v54, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v55, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v55, v55, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v40, v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v41, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v41, v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v42, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v42, v42, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v45, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v45, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v47, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v35, v47 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v56, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v43, v43, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v56, v56, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v44, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v57, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v44, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v57, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_and_b32_e32 v45, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v45, v45, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v58, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v46, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v59, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v46, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v59, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v47, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v47, v47, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v60, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v56, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v56, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v61, v61, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v57, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v62, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v57, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v62, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v58, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v58, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v31, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v59, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v63, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v59, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v63, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v60, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v60, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v61, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v61, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v62, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v62, v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: .LBB92_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_mov_b32_e32 v7, v49 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v56, v62, v56 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v62, v63, v62 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_or_b32_e32 v57, v61, v57 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v58, v14, v58 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v11 -; SI-NEXT: v_or_b32_e32 v59, v12, v59 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_or_b32_e32 v61, v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: .LBB92_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_mov_b32_e32 v33, v46 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v39, 0xff, v39 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 +; SI-NEXT: v_or_b32_e32 v63, v22, v63 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_or_b32_e32 v56, v58, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v52, v34, v52 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v19, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v25 +; SI-NEXT: v_or_b32_e32 v11, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x300, v11 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v15, v61, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v10 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v58, v14, v58 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v17, v3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v21, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v16, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v46, v19, v46 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v57, v0, v57 +; SI-NEXT: v_and_b32_e32 v41, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v41, v29, v41 +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v45, v62, v45 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v62, v26, v62 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v61, v5, v61 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v13 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v15 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v60, v10, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v6, v41 -; SI-NEXT: v_or_b32_e32 v63, v6, v63 -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v23, v2, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v47, v59, v47 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v59, v9, v59 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v2, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v33, v2, v33 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v2, v35 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v2, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v50 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v2, v51 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v53, v2, v53 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v2, v54 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v55, v2, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v2, v49 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v2, v48 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v2, v39 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v2, v38 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v36, v7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v4, v42 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v4, v2, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v60, v20, v60 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v42, v27, v42 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v42 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v37, v10, v37 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v37 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v43, v4, v43 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v41, v2, v49 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 +; SI-NEXT: v_or_b32_e32 v36, v10, v36 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v44, v4, v44 -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v51, v9, v50 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v45, v4, v45 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v52, v2, v52 -; SI-NEXT: v_mov_b32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v12, v10, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v46, v4, v46 -; SI-NEXT: v_or_b32_e32 v13, v2, v3 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v63 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v61 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v53, v9, v53 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_or_b32_e32 v47, v4, v47 -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v32, v28, v32 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v43, v24, v43 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v40, v30, v40 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v44, v24, v44 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v11, v25 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v40, v9, v40 -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 -; SI-NEXT: v_mov_b32_e32 v37, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_mov_b32_e32 v38, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_mov_b32_e32 v35, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v63 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v32, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_or_b32_e32 v31, v34, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64f16: @@ -177728,11 +175609,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -177746,6 +175627,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v61, s23, 6 ; SI-NEXT: v_writelane_b32 v61, s22, 7 ; SI-NEXT: v_writelane_b32 v61, s21, 8 +; SI-NEXT: v_writelane_b32 v61, s20, 9 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -177779,76 +175661,73 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s72, s19 -; SI-NEXT: s_mov_b32 s73, s17 -; SI-NEXT: s_mov_b32 s60, s20 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s31, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v28 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: s_mov_b32 s61, s19 +; SI-NEXT: s_mov_b32 s62, s17 +; SI-NEXT: s_mov_b32 s73, s18 +; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: v_readfirstlane_b32 s35, v0 +; SI-NEXT: v_readfirstlane_b32 s12, v27 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s41, v27 +; SI-NEXT: v_readfirstlane_b32 s41, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v62, s12, 0 -; SI-NEXT: v_readfirstlane_b32 s46, v30 +; SI-NEXT: v_readfirstlane_b32 s46, v29 ; SI-NEXT: v_writelane_b32 v62, s41, 1 -; SI-NEXT: v_readfirstlane_b32 s56, v29 +; SI-NEXT: v_readfirstlane_b32 s56, v28 ; SI-NEXT: v_writelane_b32 v62, s46, 2 ; SI-NEXT: v_writelane_b32 v62, s56, 3 -; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: v_readfirstlane_b32 s36, v3 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s99, v6 -; SI-NEXT: v_readfirstlane_b32 s94, v5 -; SI-NEXT: v_readfirstlane_b32 s38, v7 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s88, v9 -; SI-NEXT: v_readfirstlane_b32 s90, v12 -; SI-NEXT: v_readfirstlane_b32 s16, v11 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s27, v16 -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: v_readfirstlane_b32 s79, v18 -; SI-NEXT: v_readfirstlane_b32 s13, v17 -; SI-NEXT: v_readfirstlane_b32 s40, v20 -; SI-NEXT: v_readfirstlane_b32 s42, v19 -; SI-NEXT: v_readfirstlane_b32 s43, v22 -; SI-NEXT: v_readfirstlane_b32 s44, v21 -; SI-NEXT: v_readfirstlane_b32 s78, v24 -; SI-NEXT: v_readfirstlane_b32 s37, v23 -; SI-NEXT: v_readfirstlane_b32 s28, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 9 -; SI-NEXT: v_readfirstlane_b32 s7, v25 -; SI-NEXT: v_readfirstlane_b32 s95, v8 -; SI-NEXT: v_readfirstlane_b32 s96, v4 +; SI-NEXT: v_readfirstlane_b32 s77, v30 +; SI-NEXT: v_readfirstlane_b32 s96, v3 ; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s38, v4 +; SI-NEXT: v_readfirstlane_b32 s94, v7 +; SI-NEXT: v_readfirstlane_b32 s90, v6 +; SI-NEXT: v_readfirstlane_b32 s91, v9 +; SI-NEXT: v_readfirstlane_b32 s98, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v11 +; SI-NEXT: v_readfirstlane_b32 s20, v10 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s27, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s78, v17 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_readfirstlane_b32 s43, v21 +; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; SI-NEXT: v_writelane_b32 v61, s4, 10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s88, v23 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: v_readfirstlane_b32 s28, v25 +; SI-NEXT: v_readfirstlane_b32 s7, v24 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_readfirstlane_b32 s87, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 ; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 @@ -177945,215 +175824,215 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; SI-NEXT: v_writelane_b32 v61, s4, 35 -; SI-NEXT: v_writelane_b32 v61, s73, 36 +; SI-NEXT: v_writelane_b32 v61, s62, 36 ; SI-NEXT: v_writelane_b32 v61, s10, 37 -; SI-NEXT: v_writelane_b32 v61, s72, 38 -; SI-NEXT: v_writelane_b32 v61, s18, 39 -; SI-NEXT: v_writelane_b32 v61, s60, 40 -; SI-NEXT: v_writelane_b32 v61, s31, 41 -; SI-NEXT: v_writelane_b32 v61, s36, 42 -; SI-NEXT: v_writelane_b32 v61, s99, 43 +; SI-NEXT: v_writelane_b32 v61, s61, 38 +; SI-NEXT: v_writelane_b32 v61, s73, 39 +; SI-NEXT: v_writelane_b32 v61, s35, 40 +; SI-NEXT: v_writelane_b32 v61, s96, 41 +; SI-NEXT: v_writelane_b32 v61, s6, 42 +; SI-NEXT: v_writelane_b32 v61, s38, 43 ; SI-NEXT: v_writelane_b32 v61, s94, 44 -; SI-NEXT: v_writelane_b32 v61, s38, 45 +; SI-NEXT: v_writelane_b32 v61, s90, 45 ; SI-NEXT: v_writelane_b32 v61, s91, 46 -; SI-NEXT: v_writelane_b32 v61, s88, 47 -; SI-NEXT: v_writelane_b32 v61, s90, 48 -; SI-NEXT: v_writelane_b32 v61, s16, 49 +; SI-NEXT: v_writelane_b32 v61, s98, 47 +; SI-NEXT: v_writelane_b32 v61, s93, 48 +; SI-NEXT: v_writelane_b32 v61, s20, 49 ; SI-NEXT: v_writelane_b32 v61, s24, 50 -; SI-NEXT: v_writelane_b32 v61, s8, 51 -; SI-NEXT: v_writelane_b32 v61, s27, 52 +; SI-NEXT: v_writelane_b32 v61, s27, 51 +; SI-NEXT: v_writelane_b32 v61, s8, 52 ; SI-NEXT: v_writelane_b32 v61, s9, 53 -; SI-NEXT: v_writelane_b32 v61, s79, 54 -; SI-NEXT: v_writelane_b32 v61, s13, 55 +; SI-NEXT: v_writelane_b32 v61, s78, 54 +; SI-NEXT: v_writelane_b32 v61, s14, 55 ; SI-NEXT: v_writelane_b32 v61, s40, 56 ; SI-NEXT: v_writelane_b32 v61, s42, 57 ; SI-NEXT: v_writelane_b32 v61, s43, 58 ; SI-NEXT: v_writelane_b32 v61, s44, 59 -; SI-NEXT: v_writelane_b32 v61, s78, 60 +; SI-NEXT: v_writelane_b32 v61, s88, 60 ; SI-NEXT: v_writelane_b32 v61, s37, 61 ; SI-NEXT: v_writelane_b32 v61, s28, 62 ; SI-NEXT: v_writelane_b32 v61, s7, 63 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s93, v31 +; SI-NEXT: v_readfirstlane_b32 s99, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s95, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s68, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s89, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s76, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s39, v31 +; SI-NEXT: v_readfirstlane_b32 s36, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 +; SI-NEXT: v_readfirstlane_b32 s53, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s19, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s25, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s14, v31 +; SI-NEXT: v_readfirstlane_b32 s26, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s13, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s57, v31 +; SI-NEXT: v_readfirstlane_b32 s11, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s57, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s92, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s54, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s35, v31 +; SI-NEXT: v_readfirstlane_b32 s50, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: v_readfirstlane_b32 s54, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s15, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s17, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s18, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s67, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s65, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s67, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s84, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s63, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s63, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s74, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s61, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s22, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s22, v31 +; SI-NEXT: v_readfirstlane_b32 s55, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s20, v31 +; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s79, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s98, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s16, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s58, 4 -; SI-NEXT: v_writelane_b32 v62, s98, 5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_writelane_b32 v62, s58, 4 +; SI-NEXT: v_writelane_b32 v62, s16, 5 ; SI-NEXT: v_writelane_b32 v62, s77, 6 -; SI-NEXT: v_writelane_b32 v62, s92, 7 -; SI-NEXT: v_writelane_b32 v62, s15, 8 -; SI-NEXT: v_writelane_b32 v62, s20, 9 -; SI-NEXT: v_writelane_b32 v62, s23, 10 -; SI-NEXT: v_writelane_b32 v62, s26, 11 -; SI-NEXT: v_writelane_b32 v62, s48, 12 -; SI-NEXT: v_writelane_b32 v62, s17, 13 +; SI-NEXT: v_writelane_b32 v62, s79, 7 +; SI-NEXT: v_writelane_b32 v62, s29, 8 +; SI-NEXT: v_writelane_b32 v62, s75, 9 +; SI-NEXT: v_writelane_b32 v62, s21, 10 +; SI-NEXT: v_writelane_b32 v62, s23, 11 +; SI-NEXT: v_writelane_b32 v62, s17, 12 +; SI-NEXT: v_writelane_b32 v62, s18, 13 ; SI-NEXT: v_writelane_b32 v62, s52, 14 ; SI-NEXT: v_writelane_b32 v62, s65, 15 ; SI-NEXT: v_writelane_b32 v62, s64, 16 @@ -178164,375 +176043,372 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s84, 21 ; SI-NEXT: v_writelane_b32 v62, s80, 22 ; SI-NEXT: v_writelane_b32 v62, s83, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s87, 25 -; SI-NEXT: v_writelane_b32 v62, s51, 26 +; SI-NEXT: v_writelane_b32 v62, s51, 24 +; SI-NEXT: v_writelane_b32 v62, s82, 25 +; SI-NEXT: v_writelane_b32 v62, s55, 26 ; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s55, 28 -; SI-NEXT: v_writelane_b32 v62, s62, 29 -; SI-NEXT: v_writelane_b32 v62, s63, 30 -; SI-NEXT: v_writelane_b32 v62, s74, 31 -; SI-NEXT: v_writelane_b32 v62, s61, 32 -; SI-NEXT: v_writelane_b32 v62, s22, 33 +; SI-NEXT: v_writelane_b32 v62, s63, 28 +; SI-NEXT: v_writelane_b32 v62, s74, 29 +; SI-NEXT: v_writelane_b32 v62, s72, 30 +; SI-NEXT: v_writelane_b32 v62, s22, 31 +; SI-NEXT: v_writelane_b32 v62, s60, 32 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 9 ; SI-NEXT: v_readlane_b32 s5, v61, 8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 7 ; SI-NEXT: v_readlane_b32 s5, v61, 6 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 5 ; SI-NEXT: v_readlane_b32 s5, v61, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 3 ; SI-NEXT: v_readlane_b32 s5, v61, 2 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 1 ; SI-NEXT: v_readlane_b32 s5, v61, 0 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s99, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_lshl_b32 s5, s31, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s90, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_and_b32 s4, s98, 0xff ; SI-NEXT: s_lshl_b32 s5, s91, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s27, 0xff ; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s42, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s44, 0xff ; SI-NEXT: s_lshl_b32 s5, s43, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_lshl_b32 s5, s88, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff ; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_and_b32 s4, s56, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_and_b32 s4, s77, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s98, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s16, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s20, 8 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_and_b32 s4, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s61, 0xff -; SI-NEXT: s_lshl_b32 s5, s87, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s55, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s62, 0xff -; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s83, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s55, 0xff +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff ; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s71, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s70, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_and_b32 s4, s67, 0xff ; SI-NEXT: s_lshl_b32 s5, s65, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 ; SI-NEXT: s_and_b32 s4, s49, 0xff ; SI-NEXT: s_lshl_b32 s5, s52, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s18, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 ; SI-NEXT: s_and_b32 s4, s54, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_lshl_b32 s5, s39, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 ; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s34, 0xff ; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s47, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 ; SI-NEXT: s_and_b32 s4, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_and_b32 s4, s85, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: s_and_b32 s4, s97, 0xff ; SI-NEXT: s_lshl_b32 s5, s69, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_and_b32 s4, s19, 0xff ; SI-NEXT: s_lshl_b32 s5, s66, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_and_b32 s4, s81, 0xff ; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 ; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 ; SI-NEXT: s_and_b32 s4, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s68, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_and_b32 s4, s93, 0xff -; SI-NEXT: s_mov_b32 s99, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 34 -; SI-NEXT: v_readlane_b32 s5, v61, 33 -; SI-NEXT: s_mov_b32 s31, s6 -; SI-NEXT: s_mov_b32 s6, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 35 +; SI-NEXT: v_readlane_b32 s5, v61, 34 +; SI-NEXT: s_mov_b32 s6, s99 +; SI-NEXT: s_mov_b32 s99, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s55, s5 +; SI-NEXT: s_mov_b32 s96, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: s_mov_b32 s86, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 33 +; SI-NEXT: v_readlane_b32 s5, v61, 32 +; SI-NEXT: s_mov_b32 s55, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s51, s5 +; SI-NEXT: s_mov_b32 s86, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: s_mov_b32 s36, s96 -; SI-NEXT: s_mov_b32 s96, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 31 +; SI-NEXT: v_readlane_b32 s5, v61, 30 +; SI-NEXT: s_mov_b32 s35, s87 +; SI-NEXT: s_mov_b32 s82, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s82, s5 +; SI-NEXT: s_mov_b32 s87, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 29 +; SI-NEXT: v_readlane_b32 s5, v61, 28 ; SI-NEXT: s_mov_b32 s83, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s87, s5 +; SI-NEXT: s_mov_b32 s51, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 27 +; SI-NEXT: v_readlane_b32 s5, v61, 26 ; SI-NEXT: s_mov_b32 s84, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s80, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 25 +; SI-NEXT: v_readlane_b32 s5, v61, 24 ; SI-NEXT: s_mov_b32 s71, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s70, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 23 +; SI-NEXT: v_readlane_b32 s5, v61, 22 ; SI-NEXT: s_mov_b32 s49, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s67, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 21 +; SI-NEXT: v_readlane_b32 s5, v61, 20 ; SI-NEXT: s_mov_b32 s65, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s64, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s54 -; SI-NEXT: s_mov_b32 s26, s50 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 19 +; SI-NEXT: v_readlane_b32 s5, v61, 18 +; SI-NEXT: s_mov_b32 s17, s15 +; SI-NEXT: s_mov_b32 s18, s54 +; SI-NEXT: s_mov_b32 s15, s50 ; SI-NEXT: s_mov_b32 s54, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s50, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: s_mov_b32 s23, s35 -; SI-NEXT: s_mov_b32 s35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 17 +; SI-NEXT: v_readlane_b32 s5, v61, 16 +; SI-NEXT: s_mov_b32 s23, s34 +; SI-NEXT: s_mov_b32 s14, s48 +; SI-NEXT: s_mov_b32 s34, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s48, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: s_mov_b32 s15, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 15 +; SI-NEXT: v_readlane_b32 s5, v61, 14 ; SI-NEXT: s_mov_b32 s52, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s75, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: s_mov_b32 s88, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 13 +; SI-NEXT: v_readlane_b32 s5, v61, 12 +; SI-NEXT: s_mov_b32 s29, s30 +; SI-NEXT: s_mov_b32 s79, s92 ; SI-NEXT: s_mov_b32 s30, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s92, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: s_mov_b32 s13, s39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 11 +; SI-NEXT: v_readlane_b32 s5, v61, 10 +; SI-NEXT: s_mov_b32 s21, s39 ; SI-NEXT: s_mov_b32 s39, s4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_mov_b32 s77, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s38, s95 -; SI-NEXT: s_mov_b32 s20, s76 -; SI-NEXT: s_mov_b32 s98, s59 +; SI-NEXT: s_mov_b32 s38, s31 +; SI-NEXT: s_mov_b32 s16, s59 ; SI-NEXT: s_mov_b32 s56, s47 ; SI-NEXT: s_mov_b32 s58, s57 ; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s41, s14 -; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s41, s13 +; SI-NEXT: s_mov_b32 s28, s26 ; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s29, s25 +; SI-NEXT: s_mov_b32 s26, s25 ; SI-NEXT: s_mov_b32 s85, s97 ; SI-NEXT: s_mov_b32 s25, s69 -; SI-NEXT: s_mov_b32 s97, s21 +; SI-NEXT: s_mov_b32 s97, s19 ; SI-NEXT: s_mov_b32 s37, s66 ; SI-NEXT: s_mov_b32 s69, s81 ; SI-NEXT: s_mov_b32 s44, s45 ; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s34 -; SI-NEXT: s_mov_b32 s34, s89 -; SI-NEXT: s_mov_b32 s94, s68 -; SI-NEXT: s_mov_b32 s89, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_mov_b32 s53, s36 +; SI-NEXT: s_mov_b32 s98, s76 +; SI-NEXT: s_mov_b32 s36, s89 +; SI-NEXT: s_mov_b32 s90, s68 +; SI-NEXT: s_mov_b32 s89, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s39, 3 @@ -178547,7 +176423,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_add_i32 vcc_hi, s34, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s60, s48, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi @@ -178573,51 +176449,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s73, s73, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s83, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s87, 8 +; SI-NEXT: s_lshl_b32 s74, s51, 8 ; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s96, 3 +; SI-NEXT: s_add_i32 vcc_hi, s82, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s82, 8 +; SI-NEXT: s_lshl_b32 s75, s87, 8 ; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s86, 3 +; SI-NEXT: s_add_i32 vcc_hi, s55, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s51, 8 +; SI-NEXT: s_lshl_b32 s76, s86, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s6, 3 -; SI-NEXT: s_add_i32 s93, s34, 3 +; SI-NEXT: s_add_i32 vcc_hi, s99, 3 +; SI-NEXT: s_add_i32 s95, s36, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s55, 8 +; SI-NEXT: s_lshl_b32 s77, s96, 8 ; SI-NEXT: s_add_i32 s89, s89, 3 -; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s78, s94, 8 -; SI-NEXT: s_add_i32 s34, s53, 3 +; SI-NEXT: s_and_b32 s95, s95, 0xff +; SI-NEXT: s_lshl_b32 s88, s90, 8 +; SI-NEXT: s_add_i32 s36, s53, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s99, 8 -; SI-NEXT: s_or_b32 s22, s78, s93 -; SI-NEXT: s_and_b32 s93, s34, 0xff -; SI-NEXT: s_lshl_b32 s92, s88, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s6, 8 +; SI-NEXT: s_or_b32 s22, s88, s95 +; SI-NEXT: s_and_b32 s95, s36, 0xff +; SI-NEXT: s_lshl_b32 s92, s98, 8 ; SI-NEXT: s_add_i32 s53, s66, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 -; SI-NEXT: s_or_b32 s92, s92, s93 -; SI-NEXT: s_and_b32 s93, s53, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s13, 8 +; SI-NEXT: s_or_b32 s92, s92, s95 +; SI-NEXT: s_and_b32 s95, s53, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s14, 8 ; SI-NEXT: s_add_i32 s66, s69, 3 -; SI-NEXT: s_or_b32 s93, vcc_hi, s93 +; SI-NEXT: s_or_b32 s95, vcc_hi, s95 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s34, s44, 8 +; SI-NEXT: s_lshl_b32 s36, s44, 8 ; SI-NEXT: s_add_i32 s68, s97, 3 -; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi -; SI-NEXT: s_and_b32 s34, s68, 0xff +; SI-NEXT: s_or_b32 vcc_hi, s36, vcc_hi +; SI-NEXT: s_and_b32 s36, s68, 0xff ; SI-NEXT: s_lshl_b32 s39, s37, 8 ; SI-NEXT: s_add_i32 s69, s85, 3 -; SI-NEXT: s_or_b32 s34, s39, s34 +; SI-NEXT: s_or_b32 s36, s39, s36 ; SI-NEXT: s_and_b32 s39, s69, 0xff ; SI-NEXT: s_lshl_b32 s52, s25, 8 ; SI-NEXT: s_add_i32 s81, s7, 3 ; SI-NEXT: s_or_b32 s39, s52, s39 ; SI-NEXT: s_and_b32 s52, s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s29, 8 +; SI-NEXT: s_lshl_b32 s53, s26, 8 ; SI-NEXT: s_add_i32 s85, s41, 3 ; SI-NEXT: s_or_b32 s52, s53, s52 ; SI-NEXT: s_and_b32 s53, s85, 0xff @@ -178626,52 +176502,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff ; SI-NEXT: s_lshl_b32 s66, s12, 8 -; SI-NEXT: s_add_i32 s21, s15, 3 +; SI-NEXT: s_add_i32 s19, s79, 3 ; SI-NEXT: s_or_b32 s64, s66, s64 -; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s66, s56, 8 -; SI-NEXT: s_add_i32 s25, s26, 3 -; SI-NEXT: s_or_b32 s66, s66, s21 -; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s98, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 -; SI-NEXT: s_or_b32 s67, s6, s21 -; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s20, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 -; SI-NEXT: s_or_b32 s68, s18, s6 +; SI-NEXT: s_add_i32 s25, s23, 3 +; SI-NEXT: s_or_b32 s66, s66, s19 +; SI-NEXT: s_and_b32 s19, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s16, 8 +; SI-NEXT: s_add_i32 s26, s15, 3 +; SI-NEXT: s_or_b32 s67, s6, s19 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s19, s29, 8 +; SI-NEXT: s_add_i32 s28, s18, 3 +; SI-NEXT: s_or_b32 s68, s19, s6 ; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 -; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 13 -; SI-NEXT: s_add_i32 s7, s6, 3 +; SI-NEXT: s_lshl_b32 s19, s21, 8 +; SI-NEXT: s_or_b32 s69, s19, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 12 ; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 12 +; SI-NEXT: s_add_i32 s7, s6, 3 ; SI-NEXT: s_add_i32 s27, s16, 3 ; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: s_lshl_b32 s23, s16, 8 ; SI-NEXT: v_readlane_b32 s16, v62, 10 ; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: v_readlane_b32 s6, v62, 16 ; SI-NEXT: s_add_i32 s24, s16, 3 ; SI-NEXT: v_readlane_b32 s16, v62, 7 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 14 -; SI-NEXT: s_lshl_b32 s19, s16, 8 +; SI-NEXT: v_readlane_b32 s7, v62, 13 +; SI-NEXT: s_lshl_b32 s18, s16, 8 ; SI-NEXT: v_readlane_b32 s16, v62, 8 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s20, s16, 3 ; SI-NEXT: v_readlane_b32 s16, v62, 5 ; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s17, s16, 8 ; SI-NEXT: v_readlane_b32 s16, v62, 6 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 15 +; SI-NEXT: v_readlane_b32 s7, v62, 14 ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_readlane_b32 s20, v62, 4 @@ -178680,21 +176555,21 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: v_readlane_b32 s6, v62, 18 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_or_b32 s16, s20, s16 ; SI-NEXT: v_readlane_b32 s20, v62, 3 -; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 16 -; SI-NEXT: s_or_b32 s19, s19, s24 +; SI-NEXT: s_add_i32 s13, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 15 +; SI-NEXT: s_or_b32 s18, s18, s24 ; SI-NEXT: s_add_i32 s98, s20, 3 ; SI-NEXT: v_readlane_b32 s24, v62, 2 -; SI-NEXT: s_and_b32 s6, s14, 0xff +; SI-NEXT: s_and_b32 s6, s13, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: v_readlane_b32 s6, v62, 22 ; SI-NEXT: s_and_b32 s27, s27, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 ; SI-NEXT: v_readlane_b32 s24, v62, 1 @@ -178708,128 +176583,129 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 s24, s86, 0xff ; SI-NEXT: s_lshl_b32 s27, s27, 8 ; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: v_readlane_b32 s6, v62, 27 ; SI-NEXT: s_or_b32 s24, s27, s24 ; SI-NEXT: v_readlane_b32 s27, v61, 63 ; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: v_readlane_b32 s7, v62, 20 +; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v61, 62 +; SI-NEXT: v_readlane_b32 s72, v61, 62 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s27, s82, 0xff -; SI-NEXT: s_lshl_b32 s73, s73, 8 +; SI-NEXT: s_lshl_b32 s72, s72, 8 ; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 29 -; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v61, 61 +; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: s_or_b32 s27, s72, s27 +; SI-NEXT: v_readlane_b32 s72, v61, 61 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 22 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v61, 60 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s65, s72, 3 +; SI-NEXT: v_readlane_b32 s73, v61, 60 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s73, s65, 0xff -; SI-NEXT: s_lshl_b32 s74, s74, 8 +; SI-NEXT: s_and_b32 s72, s65, 0xff +; SI-NEXT: s_lshl_b32 s73, s73, 8 ; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 30 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 59 +; SI-NEXT: v_readlane_b32 s6, v62, 29 +; SI-NEXT: s_or_b32 s72, s73, s72 +; SI-NEXT: v_readlane_b32 s73, v61, 59 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 58 +; SI-NEXT: v_readlane_b32 s7, v62, 23 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s54, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v61, 58 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s74, s54, 0xff -; SI-NEXT: s_lshl_b32 s75, s75, 8 +; SI-NEXT: s_and_b32 s73, s54, 0xff +; SI-NEXT: s_lshl_b32 s74, s74, 8 ; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 32 -; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 57 +; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v61, 57 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: v_readlane_b32 s7, v62, 24 ; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: s_add_i32 s50, s74, 3 ; SI-NEXT: v_readlane_b32 s76, v61, 56 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s75, s50, 0xff +; SI-NEXT: s_and_b32 s74, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 33 -; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 32 +; SI-NEXT: s_or_b32 s74, s76, s74 ; SI-NEXT: v_readlane_b32 s76, v61, 55 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 -; SI-NEXT: s_add_i32 s18, s77, 0x300 +; SI-NEXT: v_readlane_b32 s7, v62, 26 +; SI-NEXT: s_add_i32 s19, s77, 0x300 ; SI-NEXT: s_add_i32 s48, s76, 3 ; SI-NEXT: v_readlane_b32 s77, v61, 54 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: s_or_b32 s88, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_or_b32 s76, s77, s76 ; SI-NEXT: v_readlane_b32 s77, v61, 53 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 26 -; SI-NEXT: s_add_i32 s11, s72, 0x300 -; SI-NEXT: s_add_i32 s72, s79, 0x300 +; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s75, s78, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 52 +; SI-NEXT: v_readlane_b32 s78, v61, 52 ; SI-NEXT: s_and_b32 s6, s57, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s77, s37, 0xff -; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 51 +; SI-NEXT: s_lshl_b32 s78, s78, 8 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: s_or_b32 s77, s78, s77 +; SI-NEXT: v_readlane_b32 s78, v61, 51 ; SI-NEXT: s_add_i32 s21, s89, 0x300 -; SI-NEXT: s_add_i32 s89, s88, 0x300 -; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 50 -; SI-NEXT: s_and_b32 s79, s35, 0xff -; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_add_i32 s89, s79, 0x300 +; SI-NEXT: s_add_i32 s34, s78, 3 +; SI-NEXT: v_readlane_b32 s79, v61, 50 +; SI-NEXT: s_and_b32 s78, s34, 0xff +; SI-NEXT: s_lshl_b32 s79, s79, 8 +; SI-NEXT: s_or_b32 s78, s79, s78 +; SI-NEXT: v_readlane_b32 s79, v61, 49 ; SI-NEXT: v_readlane_b32 s90, v61, 48 ; SI-NEXT: s_add_i32 s25, s92, 0x300 -; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 49 +; SI-NEXT: s_add_i32 s30, s79, 3 ; SI-NEXT: s_lshl_b32 s92, s90, 8 ; SI-NEXT: v_readlane_b32 s90, v61, 47 -; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: s_add_i32 s94, s90, 3 +; SI-NEXT: s_and_b32 s79, s30, 0xff +; SI-NEXT: s_add_i32 s93, s90, 3 ; SI-NEXT: v_readlane_b32 s90, v61, 46 -; SI-NEXT: s_and_b32 s88, s30, 0xff +; SI-NEXT: s_or_b32 s79, s92, s79 +; SI-NEXT: s_and_b32 s92, s93, 0xff ; SI-NEXT: s_lshl_b32 s91, s90, 8 ; SI-NEXT: v_readlane_b32 s90, v61, 45 -; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: s_and_b32 s92, s94, 0xff -; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s91, s91, s92 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: v_readlane_b32 s92, v61, 44 ; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s38, 8 +; SI-NEXT: s_lshl_b32 s92, s92, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 44 -; SI-NEXT: s_add_i32 s26, s93, 0x300 +; SI-NEXT: v_readlane_b32 s92, v61, 43 ; SI-NEXT: s_add_i32 s92, s92, 3 -; SI-NEXT: v_readlane_b32 s93, v61, 43 ; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s93, 8 +; SI-NEXT: s_lshl_b32 s93, s38, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 ; SI-NEXT: v_readlane_b32 s93, v61, 42 ; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: v_readlane_b32 s94, v61, 41 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s36, 8 +; SI-NEXT: s_lshl_b32 s94, s94, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 41 +; SI-NEXT: v_readlane_b32 s94, v61, 40 ; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_add_i32 s26, s95, 0x300 ; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s31, 8 +; SI-NEXT: s_lshl_b32 s95, s35, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 ; SI-NEXT: v_readlane_b32 s95, v61, 1 ; SI-NEXT: s_add_i32 s95, s95, 3 @@ -178852,41 +176728,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi ; SI-NEXT: v_readlane_b32 s30, v61, 7 +; SI-NEXT: s_addk_i32 vcc_hi, 0x300 ; SI-NEXT: s_add_i32 s30, s30, 3 ; SI-NEXT: v_readlane_b32 s31, v61, 6 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_hi ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 40 -; SI-NEXT: s_add_i32 s29, s34, 0x300 +; SI-NEXT: v_readlane_b32 s31, v61, 9 ; SI-NEXT: s_add_i32 s31, s31, 3 ; SI-NEXT: v_readlane_b32 s34, v61, 8 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 -; SI-NEXT: s_addk_i32 s31, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 -; SI-NEXT: s_addk_i32 s30, 0x300 -; SI-NEXT: s_addk_i32 vcc_hi, 0x300 ; SI-NEXT: v_readlane_b32 s34, v61, 39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_lo ; SI-NEXT: s_add_i32 s34, s34, 3 ; SI-NEXT: v_readlane_b32 s35, v61, 38 ; SI-NEXT: s_and_b32 s34, s34, 0xff -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 ; SI-NEXT: v_readlane_b32 s35, v61, 37 +; SI-NEXT: s_add_i32 s29, s36, 0x300 ; SI-NEXT: s_add_i32 s35, s35, 3 ; SI-NEXT: v_readlane_b32 s36, v61, 36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v4, s95 ; SI-NEXT: s_and_b32 s35, s35, 0xff ; SI-NEXT: s_lshl_b32 s36, s36, 8 ; SI-NEXT: s_or_b32 s35, s36, s35 @@ -178913,106 +176785,119 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_add_i32 s61, s96, 0x300 ; SI-NEXT: s_add_i32 s62, s97, 0x300 ; SI-NEXT: s_addk_i32 s63, 0x300 -; SI-NEXT: s_addk_i32 s78, 0x300 +; SI-NEXT: s_addk_i32 s88, 0x300 ; SI-NEXT: s_addk_i32 s23, 0x300 -; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_addk_i32 s27, 0x300 +; SI-NEXT: s_addk_i32 s72, 0x300 ; SI-NEXT: s_addk_i32 s73, 0x300 ; SI-NEXT: s_addk_i32 s74, 0x300 -; SI-NEXT: s_addk_i32 s75, 0x300 ; SI-NEXT: s_addk_i32 s76, 0x300 ; SI-NEXT: s_addk_i32 s77, 0x300 +; SI-NEXT: s_addk_i32 s78, 0x300 ; SI-NEXT: s_addk_i32 s79, 0x300 -; SI-NEXT: s_addk_i32 s88, 0x300 ; SI-NEXT: s_addk_i32 s91, 0x300 ; SI-NEXT: s_addk_i32 s90, 0x300 ; SI-NEXT: s_addk_i32 s92, 0x300 ; SI-NEXT: s_addk_i32 s93, 0x300 ; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 s30, 0x300 +; SI-NEXT: s_addk_i32 s31, 0x300 ; SI-NEXT: s_addk_i32 s34, 0x300 ; SI-NEXT: s_addk_i32 s35, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s59 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 ; SI-NEXT: v_readlane_b32 s96, v63, 32 @@ -179049,355 +176934,249 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v57 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v27 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v37 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v38 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v49 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v43 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v41 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v45 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v58 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v34, v29 +; SI-NEXT: v_or_b32_e32 v30, v33, v30 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s54 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s26, s50 -; SI-NEXT: s_mov_b32 s23, s35 -; SI-NEXT: s_mov_b32 s15, s75 -; SI-NEXT: s_mov_b32 s20, s76 -; SI-NEXT: s_mov_b32 s98, s59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: s_mov_b32 s18, s54 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_mov_b32 s17, s15 +; SI-NEXT: s_mov_b32 s15, s50 +; SI-NEXT: s_mov_b32 s23, s34 +; SI-NEXT: s_mov_b32 s21, s39 +; SI-NEXT: s_mov_b32 s29, s30 +; SI-NEXT: s_mov_b32 s79, s92 +; SI-NEXT: s_mov_b32 s16, s59 ; SI-NEXT: s_mov_b32 s58, s57 ; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s41, s14 +; SI-NEXT: s_mov_b32 s41, s13 ; SI-NEXT: s_mov_b32 s12, s11 ; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s25 +; SI-NEXT: s_mov_b32 s28, s26 +; SI-NEXT: s_mov_b32 s26, s25 ; SI-NEXT: s_mov_b32 s85, s97 -; SI-NEXT: s_mov_b32 s97, s21 +; SI-NEXT: s_mov_b32 s97, s19 ; SI-NEXT: s_mov_b32 s25, s69 ; SI-NEXT: s_mov_b32 s69, s81 ; SI-NEXT: s_mov_b32 s37, s66 ; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s34 -; SI-NEXT: s_mov_b32 s34, s89 -; SI-NEXT: s_mov_b32 s89, s93 +; SI-NEXT: s_mov_b32 s53, s36 +; SI-NEXT: s_mov_b32 s36, s89 +; SI-NEXT: s_mov_b32 s89, s95 ; SI-NEXT: s_mov_b32 s44, s45 -; SI-NEXT: s_mov_b32 s13, s39 -; SI-NEXT: s_mov_b32 s88, s30 -; SI-NEXT: s_mov_b32 s38, s95 -; SI-NEXT: s_mov_b32 s94, s68 -; SI-NEXT: s_mov_b32 s36, s96 -; SI-NEXT: s_mov_b32 s31, s6 -; SI-NEXT: v_readlane_b32 s6, v61, 34 +; SI-NEXT: s_mov_b32 s14, s48 +; SI-NEXT: s_mov_b32 s98, s76 +; SI-NEXT: s_mov_b32 s90, s68 +; SI-NEXT: s_mov_b32 s38, s31 +; SI-NEXT: s_mov_b32 s6, s99 +; SI-NEXT: s_mov_b32 s35, s87 ; SI-NEXT: v_readlane_b32 s99, v61, 35 +; SI-NEXT: v_readlane_b32 s96, v61, 34 ; SI-NEXT: v_readlane_b32 s55, v61, 33 +; SI-NEXT: v_readlane_b32 s82, v61, 31 ; SI-NEXT: v_readlane_b32 s86, v61, 32 -; SI-NEXT: v_readlane_b32 s96, v61, 30 -; SI-NEXT: v_readlane_b32 s51, v61, 31 -; SI-NEXT: v_readlane_b32 s83, v61, 28 -; SI-NEXT: v_readlane_b32 s82, v61, 29 -; SI-NEXT: v_readlane_b32 s84, v61, 26 -; SI-NEXT: v_readlane_b32 s87, v61, 27 -; SI-NEXT: v_readlane_b32 s80, v61, 25 -; SI-NEXT: v_readlane_b32 s71, v61, 24 -; SI-NEXT: v_readlane_b32 s49, v61, 22 -; SI-NEXT: v_readlane_b32 s70, v61, 23 -; SI-NEXT: v_readlane_b32 s65, v61, 20 -; SI-NEXT: v_readlane_b32 s67, v61, 21 -; SI-NEXT: v_readlane_b32 s54, v61, 18 -; SI-NEXT: v_readlane_b32 s64, v61, 19 -; SI-NEXT: v_readlane_b32 s50, v61, 17 -; SI-NEXT: v_readlane_b32 s35, v61, 16 -; SI-NEXT: v_readlane_b32 s52, v61, 14 -; SI-NEXT: v_readlane_b32 s48, v61, 15 -; SI-NEXT: v_readlane_b32 s30, v61, 12 -; SI-NEXT: v_readlane_b32 s39, v61, 10 -; SI-NEXT: v_readlane_b32 s92, v61, 11 -; SI-NEXT: v_readlane_b32 s77, v61, 9 -; SI-NEXT: v_readlane_b32 s75, v61, 13 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_readlane_b32 s83, v61, 29 +; SI-NEXT: v_readlane_b32 s87, v61, 30 +; SI-NEXT: v_readlane_b32 s84, v61, 27 +; SI-NEXT: v_readlane_b32 s51, v61, 28 +; SI-NEXT: v_readlane_b32 s80, v61, 26 +; SI-NEXT: v_readlane_b32 s71, v61, 25 +; SI-NEXT: v_readlane_b32 s49, v61, 23 +; SI-NEXT: v_readlane_b32 s70, v61, 24 +; SI-NEXT: v_readlane_b32 s65, v61, 21 +; SI-NEXT: v_readlane_b32 s67, v61, 22 +; SI-NEXT: v_readlane_b32 s54, v61, 19 +; SI-NEXT: v_readlane_b32 s64, v61, 20 +; SI-NEXT: v_readlane_b32 s50, v61, 18 +; SI-NEXT: v_readlane_b32 s34, v61, 17 +; SI-NEXT: v_readlane_b32 s52, v61, 15 +; SI-NEXT: v_readlane_b32 s48, v61, 16 +; SI-NEXT: v_readlane_b32 s30, v61, 13 +; SI-NEXT: v_readlane_b32 s39, v61, 11 +; SI-NEXT: v_readlane_b32 s92, v61, 12 +; SI-NEXT: v_readlane_b32 s77, v61, 10 +; SI-NEXT: v_readlane_b32 s75, v61, 14 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 @@ -179407,10 +177186,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: @@ -183093,781 +180871,843 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v31 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v45, v46 -; SI-NEXT: v_mov_b32_e32 v46, v6 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v13 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v10, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_mov_b32_e32 v14, v15 +; SI-NEXT: v_mov_b32_e32 v15, v18 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v44, v12, v5 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_or_b32_e32 v41, v17, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v54, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v17, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_or_b32_e32 v52, v17, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v17, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_or_b32_e32 v48, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v5, v14 +; SI-NEXT: v_or_b32_e32 v39, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_or_b32_e32 v37, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v5, v14 +; SI-NEXT: v_or_b32_e32 v38, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_or_b32_e32 v35, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v5, v14 +; SI-NEXT: v_or_b32_e32 v36, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_or_b32_e32 v33, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v5, v14 +; SI-NEXT: v_or_b32_e32 v34, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_or_b32_e32 v31, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v5, v14 +; SI-NEXT: v_or_b32_e32 v32, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v29, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v5, v14 +; SI-NEXT: v_or_b32_e32 v30, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v24, v47, v14 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v25, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v17, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v17, v5 ; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_or_b32_e32 v22, v58, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 -; SI-NEXT: v_or_b32_e32 v23, v57, v5 +; SI-NEXT: v_or_b32_e32 v25, v58, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_or_b32_e32 v26, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_or_b32_e32 v20, v61, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_or_b32_e32 v21, v60, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v18, v40, v5 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v61, v5 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v55, v5 -; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v24, v60, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: v_or_b32_e32 v16, v1, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v42, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v21, v1, v5 +; SI-NEXT: v_mov_b32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v22, v55, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: v_or_b32_e32 v19, v40, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v20, v2, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v16, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_or_b32_e32 v17, v42, v1 ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v3, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v12, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v13, 8, 8 +; SI-NEXT: v_mov_b32_e32 v13, v46 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v18, v15 +; SI-NEXT: v_mov_b32_e32 v15, v14 +; SI-NEXT: v_mov_b32_e32 v14, v45 +; SI-NEXT: v_mov_b32_e32 v45, v10 +; SI-NEXT: v_mov_b32_e32 v10, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -183929,78 +181769,82 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_alignbit_b32 v2, v17, v16, 16 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB94_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v16, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v17, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_alignbit_b32 v46, v17, v16, 24 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_alignbit_b32 v47, v17, v16, 16 +; SI-NEXT: v_or_b32_e32 v19, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v20, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v16, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v18, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v22, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -184008,9 +181852,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v20, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -184018,7 +181864,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_or_b32_e32 v24, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -184026,40 +181872,40 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v22, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_or_b32_e32 v24, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v27, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -184071,25 +181917,25 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v26, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v28, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v27, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_or_b32_e32 v30, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -184099,32 +181945,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v32, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_or_b32_e32 v30, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v33, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v34, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -184137,24 +181983,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v36, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -184164,16 +182010,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_or_b32_e32 v38, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v48, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 @@ -184181,15 +182027,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v39, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -184202,354 +182045,362 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v49, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v51, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v50, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v50, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v52, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v54, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v53, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_or_b32_e32 v53, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v44, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; SI-NEXT: v_or_b32_e32 v41, v3, v1 ; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v6, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184560,30 +182411,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184594,14 +182447,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184610,14 +182463,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184628,30 +182481,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184662,8 +182517,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -184678,14 +182533,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184696,14 +182551,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184712,14 +182567,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184730,14 +182585,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184746,14 +182601,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184764,14 +182619,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184780,14 +182635,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184798,14 +182653,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184814,14 +182669,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184832,14 +182687,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184848,17 +182703,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -184866,14 +182721,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -184882,14 +182737,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -184900,32 +182755,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -184936,14 +182789,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -184954,14 +182807,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -184972,32 +182825,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -185008,14 +182859,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -185024,56 +182875,50 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v46 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -188321,215 +186166,324 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v50 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 ; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_writelane_b32 v63, s37, 5 ; SI-NEXT: v_writelane_b32 v63, s38, 6 ; SI-NEXT: v_writelane_b32 v63, s39, 7 ; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 ; SI-NEXT: v_writelane_b32 v63, s49, 9 ; SI-NEXT: v_writelane_b32 v63, s50, 10 ; SI-NEXT: v_writelane_b32 v63, s51, 11 ; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v63, s53, 13 ; SI-NEXT: v_writelane_b32 v63, s54, 14 ; SI-NEXT: v_writelane_b32 v63, s55, 15 ; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: v_writelane_b32 v63, s65, 17 ; SI-NEXT: v_writelane_b32 v63, s66, 18 ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 ; SI-NEXT: v_writelane_b32 v63, s69, 21 ; SI-NEXT: v_writelane_b32 v63, s70, 22 ; SI-NEXT: v_writelane_b32 v63, s71, 23 ; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 ; SI-NEXT: v_writelane_b32 v63, s83, 27 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v9, s27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_mov_b32_e32 v46, v29 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v36 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v60 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v25 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_readfirstlane_b32 s5, v61 ; SI-NEXT: s_or_b32 s44, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v28 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_readfirstlane_b32 s5, v19 ; SI-NEXT: s_or_b32 s45, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 4 @@ -188540,13 +186494,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v60 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v21 ; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v44 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: v_readfirstlane_b32 s5, v0 ; SI-NEXT: s_or_b32 s43, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 10 @@ -188557,14 +186511,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v41 ; SI-NEXT: s_or_b32 s40, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v54 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v39 +; SI-NEXT: v_readfirstlane_b32 s5, v16 ; SI-NEXT: s_or_b32 s41, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 16 @@ -188575,32 +186528,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: v_readfirstlane_b32 s5, v48 ; SI-NEXT: s_or_b32 s28, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v40, v34 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v51, v42 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: v_bfe_u32 v38, v47, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v33, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v9, 8, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v22 ; SI-NEXT: s_or_b32 s29, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 22 @@ -188611,15 +186545,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v49 ; SI-NEXT: s_or_b32 s26, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v45 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v43 +; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s27, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 28 @@ -188630,22 +186562,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: v_bfe_u32 v43, v31, 8, 8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s5, v40 ; SI-NEXT: s_or_b32 s24, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_mov_b32_e32 v13, v9 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s25, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 34 @@ -188656,13 +186580,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v56 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: v_readfirstlane_b32 s5, v23 ; SI-NEXT: s_or_b32 s22, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v11 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v50 ; SI-NEXT: s_or_b32 s23, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 40 @@ -188674,120 +186598,106 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_mov_b32_e32 v9, v51 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_readfirstlane_b32 s5, v9 ; SI-NEXT: s_or_b32 s20, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_readfirstlane_b32 s5, v29 ; SI-NEXT: s_or_b32 s21, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 -; SI-NEXT: v_mov_b32_e32 v58, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v20 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v35 ; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 +; SI-NEXT: v_readfirstlane_b32 s5, v31 ; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v24 +; SI-NEXT: v_readfirstlane_b32 s5, v26 ; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_mov_b32_e32 v1, v53 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v21, v24 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: v_mov_b32_e32 v13, v12 -; SI-NEXT: v_bfe_u32 v24, v12, 8, 8 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_bfe_u32 v48, v48, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v59, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_bfe_u32 v18, v11, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s5, v32 ; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_mov_b32_e32 v37, v18 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v42 +; SI-NEXT: v_readfirstlane_b32 s5, v37 ; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_mov_b32_e32 v51, v15 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: v_readfirstlane_b32 s5, v51 ; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v33 ; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v43 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v42 ; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v28 +; SI-NEXT: v_readfirstlane_b32 s5, v39 ; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v45 +; SI-NEXT: v_readfirstlane_b32 s5, v55 ; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v57 +; SI-NEXT: v_readfirstlane_b32 s5, v30 ; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: v_readfirstlane_b32 s5, v59 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v46 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v25 +; SI-NEXT: v_readfirstlane_b32 s5, v27 ; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v53 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v50 +; SI-NEXT: v_readfirstlane_b32 s5, v47 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v54 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v53 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s5, s46, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: v_readfirstlane_b32 s46, v1 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v2, v40 +; SI-NEXT: v_mov_b32_e32 v56, v23 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: s_lshr_b32 s71, s45, 8 +; SI-NEXT: s_lshr_b32 s70, s43, 8 +; SI-NEXT: s_lshr_b32 s69, s41, 8 +; SI-NEXT: s_lshr_b32 s68, s29, 8 +; SI-NEXT: s_lshr_b32 s66, s27, 8 +; SI-NEXT: s_lshr_b32 s64, s25, 8 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s52, s21, 8 +; SI-NEXT: s_lshr_b32 s50, s19, 8 ; SI-NEXT: s_lshr_b32 s48, s17, 8 ; SI-NEXT: s_lshr_b32 s67, s15, 8 ; SI-NEXT: s_lshr_b32 s65, s13, 8 ; SI-NEXT: s_lshr_b32 s55, s11, 8 ; SI-NEXT: s_lshr_b32 s53, s9, 8 ; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_lshr_b32 s49, s5, 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 @@ -188800,180 +186710,206 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 ; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 +; SI-NEXT: v_mov_b32_e32 v47, v28 +; SI-NEXT: v_bfe_u32 v35, v28, 8, 8 +; SI-NEXT: v_mov_b32_e32 v50, v44 +; SI-NEXT: v_bfe_u32 v32, v44, 8, 8 +; SI-NEXT: v_mov_b32_e32 v44, v54 +; SI-NEXT: v_bfe_u32 v31, v54, 8, 8 +; SI-NEXT: v_mov_b32_e32 v54, v52 +; SI-NEXT: v_bfe_u32 v29, v52, 8, 8 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_bfe_u32 v49, v45, 8, 8 +; SI-NEXT: v_bfe_u32 v25, v12, 8, 8 +; SI-NEXT: v_mov_b32_e32 v12, v11 +; SI-NEXT: v_bfe_u32 v21, v11, 8, 8 +; SI-NEXT: v_bfe_u32 v16, v10, 8, 8 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_bfe_u32 v0, v8, 8, 8 +; SI-NEXT: v_mov_b32_e32 v45, v36 +; SI-NEXT: v_bfe_u32 v19, v36, 8, 8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_bfe_u32 v40, v7, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s5, s46, s5 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: s_lshr_b32 s49, s5, 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 ; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_mov_b32_e32 v57, v30 -; SI-NEXT: v_bfe_u32 v50, v30, 8, 8 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v52, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v1, 8, 8 -; SI-NEXT: v_mov_b32_e32 v3, v14 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: v_mov_b32_e32 v44, v11 +; SI-NEXT: v_mov_b32_e32 v7, v43 +; SI-NEXT: v_bfe_u32 v61, v43, 8, 8 +; SI-NEXT: v_mov_b32_e32 v43, v3 +; SI-NEXT: v_bfe_u32 v60, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v41, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v53, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v14, 8, 8 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v20, v53 +; SI-NEXT: v_mov_b32_e32 v53, v5 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v18, v59 ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v34, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v47, v28 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v56, v23 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v50, v44 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v44, v54 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_mov_b32_e32 v54, v52 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v52, v45 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_mov_b32_e32 v12, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v10, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v45, v36 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_mov_b32_e32 v36, v7 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v7, v43 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v43, v3 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v2, v40 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v38, v48 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v4, v22 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v14, v9 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v20, v53 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: v_mov_b32_e32 v53, v5 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v25, v59 +; SI-NEXT: v_mov_b32_e32 v37, v18 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: v_mov_b32_e32 v18, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -189008,458 +186944,455 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s81, 49 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v14, v17 -; SI-NEXT: v_mov_b32_e32 v17, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v13, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v2, v25 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v7, v10 +; SI-NEXT: v_mov_b32_e32 v8, v12 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v18, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_bfe_u32 v50, v57, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v12, 8, 8 -; SI-NEXT: v_bfe_u32 v43, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v24, v13, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v31, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v54, 8, 8 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v55, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v53, 8, 8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v53 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_bfe_u32 v42, v15, 8, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s7, v20 ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_bfe_u32 v48, v20, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 ; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_readfirstlane_b32 s9, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_bfe_u32 v41, v53, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 ; SI-NEXT: s_lshr_b32 s53, s9, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s11, v43 ; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_bfe_u32 v60, v43, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 ; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_bfe_u32 v61, v5, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 ; SI-NEXT: s_lshr_b32 s65, s13, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_bfe_u32 v40, v1, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 ; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s17, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s17, v6 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_bfe_u32 v19, v6, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s19, v7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 ; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 ; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s21, v4 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 ; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s23, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_readfirstlane_b32 s23, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s23, v8 ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_bfe_u32 v21, v8, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: s_or_b32 s23, s24, s23 ; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s25, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v30 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_bfe_u32 v35, v30, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_bfe_u32 v25, v2, 8, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s26, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: s_or_b32 s25, s26, s25 ; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s26, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s27, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 ; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 -; SI-NEXT: v_bfe_u32 v18, v44, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s27, v46 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s27, v52 ; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: v_bfe_u32 v37, v46, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: v_bfe_u32 v49, v52, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 ; SI-NEXT: s_or_b32 s27, s28, s27 ; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 ; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s29, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 ; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s29, v56 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_bfe_u32 v16, v4, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s29, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: v_bfe_u32 v38, v56, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_bfe_u32 v29, v54, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s40, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s40, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s41, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s41, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 ; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: v_readfirstlane_b32 s41, v57 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s41, v44 ; SI-NEXT: s_lshl_b32 s41, s41, 16 -; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_bfe_u32 v31, v44, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s41, s42, s41 ; SI-NEXT: s_lshr_b32 s69, s41, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s43, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 ; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s43, v50 ; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: v_bfe_u32 v32, v50, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s43, s44, s43 ; SI-NEXT: s_lshr_b32 s70, s43, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_bfe_u32 v47, v1, 8, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s45, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: v_readfirstlane_b32 s45, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s45, v47 ; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: v_bfe_u32 v35, v47, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s46, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s46, v0 ; SI-NEXT: s_or_b32 s45, s46, s45 ; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 ; SI-NEXT: v_writelane_b32 v62, s46, 4 @@ -189537,7 +187470,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 ; SI-NEXT: s_lshr_b32 s71, s45, 8 +; SI-NEXT: v_bfe_u32 v0, v7, 8, 8 ; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 ; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 @@ -189552,17 +187488,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s47, s57, s47 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v16, s44 +; SI-NEXT: v_mov_b32_e32 v13, s44 ; SI-NEXT: s_and_b32 s44, s45, 0xff ; SI-NEXT: s_lshl_b32 s45, s71, 8 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 ; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v35 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v13, s44, v13 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, s44, v14 ; SI-NEXT: v_readlane_b32 s44, v62, 6 ; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_lshl_b32 s44, s44, 8 @@ -189578,16 +187514,18 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s44, s45, s44 ; SI-NEXT: s_and_b32 s42, s42, 0xffff ; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_mov_b32_e32 v15, s42 ; SI-NEXT: s_and_b32 s42, s43, 0xff ; SI-NEXT: s_lshl_b32 s43, s70, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v50 ; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v32 ; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v12, s42, v12 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v18, s42, v18 ; SI-NEXT: v_readlane_b32 s42, v62, 12 ; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_lshl_b32 s42, s42, 8 @@ -189602,16 +187540,16 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s42, s43, s42 ; SI-NEXT: s_and_b32 s40, s40, 0xffff ; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: v_mov_b32_e32 v20, s40 ; SI-NEXT: s_and_b32 s40, s41, 0xff ; SI-NEXT: s_lshl_b32 s41, s69, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v44 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v31 ; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_or_b32_e32 v11, s40, v11 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v22, s40, v22 ; SI-NEXT: v_readlane_b32 s40, v62, 18 ; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_lshl_b32 s40, s40, 8 @@ -189626,16 +187564,16 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s40, s41, s40 ; SI-NEXT: s_and_b32 s28, s28, 0xffff ; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v23, s28 ; SI-NEXT: s_and_b32 s28, s29, 0xff ; SI-NEXT: s_lshl_b32 s29, s68, 8 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v54 ; SI-NEXT: s_or_b32 s28, s28, s29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v29 ; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v25, s28, v25 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_or_b32_e32 v26, s28, v26 ; SI-NEXT: v_readlane_b32 s28, v62, 24 ; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_lshl_b32 s28, s28, 8 @@ -189650,16 +187588,16 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s28, s29, s28 ; SI-NEXT: s_and_b32 s26, s26, 0xffff ; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: v_mov_b32_e32 v28, s26 ; SI-NEXT: s_and_b32 s26, s27, 0xff ; SI-NEXT: s_lshl_b32 s27, s66, 8 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v52 ; SI-NEXT: s_or_b32 s26, s26, s27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 ; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v27, s26, v27 +; SI-NEXT: v_or_b32_e32 v12, v27, v12 +; SI-NEXT: v_or_b32_e32 v12, s26, v12 ; SI-NEXT: v_readlane_b32 s26, v62, 30 ; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_lshl_b32 s26, s26, 8 @@ -189675,66 +187613,66 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_or_b32 s26, s27, s26 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v24 +; SI-NEXT: v_mov_b32_e32 v27, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xff ; SI-NEXT: s_lshl_b32 s25, s64, 8 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 -; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v24 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v35 -; SI-NEXT: buffer_store_dword v21, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v24 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v29, s24, v29 -; SI-NEXT: buffer_store_dword v23, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: buffer_store_dword v18, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v24 +; SI-NEXT: v_or_b32_e32 v11, s24, v11 +; SI-NEXT: buffer_store_dword v20, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v24 ; SI-NEXT: v_readlane_b32 s24, v62, 36 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v22, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v24 ; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v23, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v24 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: v_readlane_b32 s24, v62, 38 -; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v24 ; SI-NEXT: v_readlane_b32 s25, v62, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: v_readlane_b32 s26, v62, 40 -; SI-NEXT: buffer_store_dword v26, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v24 ; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_lshl_b32 s25, s26, 24 -; SI-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v27, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v24 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v29, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v24 ; SI-NEXT: v_mov_b32_e32 v12, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xff ; SI-NEXT: s_lshl_b32 s23, s54, 8 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 ; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v21 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v10, s22, v10 @@ -189749,47 +187687,83 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s23, s24, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v24 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v24 ; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s20, s21, 0xff ; SI-NEXT: s_lshl_b32 s21, s52, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v31 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s21, s96, 24 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v19 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v42 +; SI-NEXT: v_readlane_b32 s45, v62, 17 +; SI-NEXT: v_readlane_b32 s43, v62, 23 +; SI-NEXT: v_readlane_b32 s41, v62, 29 +; SI-NEXT: v_readlane_b32 s29, v62, 35 +; SI-NEXT: v_readlane_b32 s27, v62, 41 +; SI-NEXT: v_readlane_b32 s25, v62, 45 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v9, s20, v9 -; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s20, s58, 8 ; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_lshl_b32 s21, s96, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v24 ; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 ; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff ; SI-NEXT: s_lshl_b32 s19, s50, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v54 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v19 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v8, s18, v8 -; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_or_b32_e32 v0, s18, v0 ; SI-NEXT: s_lshl_b32 s18, s38, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: s_and_b32 s18, s36, 0xff @@ -189797,23 +187771,23 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s19, s34, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v24 ; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 -; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x48, v24 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s16, s17, 0xff ; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v44 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v7, s16, v7 -; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_or_b32_e32 v0, s16, v0 ; SI-NEXT: s_lshl_b32 s16, s30, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff @@ -189821,23 +187795,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s17, s92, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v24 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x50, v24 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s14, s15, 0xff ; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v6, s14, v6 -; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_or_b32_e32 v0, s14, v0 ; SI-NEXT: s_lshl_b32 s14, s90, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: s_and_b32 s14, s88, 0xff @@ -189845,23 +187817,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s15, s78, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v24 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x58, v24 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s12, s13, 0xff ; SI-NEXT: s_lshl_b32 s13, s65, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v61 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s12, v5 -; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v0, s12, v0 ; SI-NEXT: s_lshl_b32 s12, s76, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: s_and_b32 s12, s74, 0xff @@ -189869,23 +187840,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s13, s72, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v24 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x60, v24 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s10, s11, 0xff ; SI-NEXT: s_lshl_b32 s11, s55, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, s10, v4 -; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v0, s10, v0 ; SI-NEXT: s_lshl_b32 s10, s60, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_and_b32 s10, s56, 0xff @@ -189893,23 +187862,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s11, s46, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v24 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x68, v24 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s53, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v2, s8, v2 -; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_or_b32_e32 v0, s8, v0 ; SI-NEXT: s_lshl_b32 s8, s86, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: s_and_b32 s8, s84, 0xff @@ -189917,24 +187884,23 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s9, s82, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v24 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x70, v24 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s51, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v1, s6, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v0, s6, v0 ; SI-NEXT: v_readlane_b32 s6, v62, 46 -; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: v_readlane_b32 s7, v62, 47 ; SI-NEXT: s_or_b32 s4, s4, s6 @@ -189944,85 +187910,60 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v24 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x78, v24 ; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s45, v62, 17 -; SI-NEXT: v_readlane_b32 s43, v62, 23 -; SI-NEXT: v_readlane_b32 s41, v62, 29 -; SI-NEXT: v_readlane_b32 s29, v62, 35 -; SI-NEXT: v_readlane_b32 s27, v62, 41 -; SI-NEXT: v_readlane_b32 s25, v62, 45 ; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 ; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 ; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 ; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 ; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 ; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 ; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 ; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 ; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 ; SI-NEXT: v_readlane_b32 s51, v63, 11 ; SI-NEXT: v_readlane_b32 s50, v63, 10 ; SI-NEXT: v_readlane_b32 s49, v63, 9 ; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 ; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 ; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 ; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v24 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -193834,1423 +191775,1416 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v128i8_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v61, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v60, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v20 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v0 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v7 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v51, 0xff, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v48, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v60, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v5, v2, v5 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v1, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v1, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v22, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v15, v24, v7 +; SI-NEXT: v_or_b32_e32 v12, v1, v12 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v13, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v18, v36, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v17, v1, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v19, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v1, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v15, v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v25, v1, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v19, v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v49, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v21, v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v50, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v23, v23, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v52, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v27, v27, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v29, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mov_b32_e32 v3, v34 -; SI-NEXT: v_or_b32_e32 v34, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v35, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v36, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v37, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v37, v37, v7 +; SI-NEXT: v_or_b32_e32 v38, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v57, v51, v7 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v63, v7, v63 +; SI-NEXT: v_or_b32_e32 v39, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v51, v56, v7 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v56, v60, v7 +; SI-NEXT: v_or_b32_e32 v44, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v9, v9, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v7, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v12, v3 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v45, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v31, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v31, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v46, v56, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v31, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v48, v48, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v61, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v40, v40, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v60, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v47, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v61, v43, v59 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v43, v43, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v25, v6, v13 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v5, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v59, v59, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v6, v11, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v15 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v6, v15, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v26, v11, v18 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v11, v11, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v5 +; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v9, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v48, v9, v12 +; SI-NEXT: v_alignbit_b32 v9, v11, v12, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v12, v9, v17 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v9, v12, v14, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v51, v9, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v15, v9, v16 +; SI-NEXT: v_alignbit_b32 v9, v51, v16, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v16, v13, v20 +; SI-NEXT: v_alignbit_b32 v13, v9, v20, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v31, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v13, v31, v22, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v14, v13, v24, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v24, v14, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v14, v28 +; SI-NEXT: v_alignbit_b32 v14, v24, v28, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v21, v14, v54 +; SI-NEXT: v_or_b32_e32 v22, v6, v30 +; SI-NEXT: v_alignbit_b32 v6, v21, v30, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v6, v55 +; SI-NEXT: v_or_b32_e32 v29, v1, v34 +; SI-NEXT: v_alignbit_b32 v1, v19, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v28, v11, v30 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v20, v11, v21 -; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v20, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v23, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v29, v11, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v23, v11, v27 -; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v19, v11, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46 -; SI-NEXT: v_or_b32_e32 v27, v11, v50 -; SI-NEXT: v_alignbit_b32 v11, v19, v50, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v21, v15, v53 -; SI-NEXT: v_alignbit_b32 v15, v11, v53, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v15, v15, v2 -; SI-NEXT: v_or_b32_e32 v46, v3, v43 -; SI-NEXT: v_alignbit_b32 v3, v15, v43, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v17, v3, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v39, v3, v45 -; SI-NEXT: v_alignbit_b32 v3, v17, v45, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v30, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v27, v1, v38 +; SI-NEXT: v_alignbit_b32 v1, v30, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v61, v3, v24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v28, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v34, v1, v45 +; SI-NEXT: v_alignbit_b32 v1, v28, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v61, v58, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v33, v1, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v36, v1, v7 +; SI-NEXT: v_alignbit_b32 v1, v33, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v62, v3, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v40, v3, v36 -; SI-NEXT: v_alignbit_b32 v3, v62, v36, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v59, v3, v35 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v31, v3, v54 -; SI-NEXT: v_alignbit_b32 v3, v59, v54, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v47, v3, v37 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v25, v3, v57 -; SI-NEXT: v_alignbit_b32 v3, v47, v57, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v45, v3, v51 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v22, v3, v56 -; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v35, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v38, v1, v46 +; SI-NEXT: v_alignbit_b32 v1, v35, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v44, v3, v4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_or_b32_e32 v12, v1, v9 -; SI-NEXT: v_alignbit_b32 v1, v44, v9, 16 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_mov_b32_e32 v13, v25 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_mov_b32_e32 v14, v31 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v39, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v37, v3, 16 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB96_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_mov_b32_e32 v30, v16 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v23, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_mov_b32_e32 v32, v24 -; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v60, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v46, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v45 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v63, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195259,15 +193193,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195276,15 +193211,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195292,17 +193227,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195311,15 +193244,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195328,15 +193260,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195345,15 +193278,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195362,652 +193296,526 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v13 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v18 +; SI-NEXT: v_mov_b32_e32 v18, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_or_b32_e32 v22, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_or_b32_e32 v25, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_or_b32_e32 v24, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_or_b32_e32 v29, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v31, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v26 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v21, v17 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v23 -; SI-NEXT: v_mov_b32_e32 v12, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 -; SI-NEXT: v_mov_b32_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v24 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v48, v20, v19 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v25 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v1, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v1, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; SI-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v6, v16, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v0, v11, v48, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v26, v10, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v12, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v51, v15, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v29, v23, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v9, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v31, v3, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v13, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v24, v5, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v11, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v21, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v15, v46, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v19, v29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v17, v39, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v61, v9, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v23, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v59, v14, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v30, v27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v47, v13, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v28, v34, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v45, v35, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v33, v36, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v44, v12, 16 -; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_alignbit_b32 v0, v35, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v37, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64i16: @@ -200132,18 +197940,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v41, s30, 0 @@ -200161,7 +197969,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s19, 10 ; SI-NEXT: v_writelane_b32 v43, s18, 11 ; SI-NEXT: v_writelane_b32 v43, s17, 12 -; SI-NEXT: v_writelane_b32 v43, s16, 13 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 ; SI-NEXT: v_writelane_b32 v41, s35, 3 @@ -200196,289 +198003,299 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: s_mov_b32 s22, s16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s39, v26 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s47, v12 -; SI-NEXT: v_writelane_b32 v42, s39, 0 ; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: v_writelane_b32 v42, s47, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v24 -; SI-NEXT: v_writelane_b32 v42, s56, 2 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s57, v10 +; SI-NEXT: v_writelane_b32 v42, s56, 0 ; SI-NEXT: v_readfirstlane_b32 s49, v23 -; SI-NEXT: v_writelane_b32 v42, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v21 -; SI-NEXT: v_writelane_b32 v42, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v22 -; SI-NEXT: v_writelane_b32 v42, s50, 5 -; SI-NEXT: v_writelane_b32 v42, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_writelane_b32 v42, s57, 1 +; SI-NEXT: v_readfirstlane_b32 s50, v22 +; SI-NEXT: v_writelane_b32 v42, s49, 2 +; SI-NEXT: v_readfirstlane_b32 s51, v20 +; SI-NEXT: v_writelane_b32 v42, s50, 3 +; SI-NEXT: v_readfirstlane_b32 s52, v21 +; SI-NEXT: v_writelane_b32 v42, s51, 4 +; SI-NEXT: v_writelane_b32 v42, s52, 5 ; SI-NEXT: v_readfirstlane_b32 s58, v19 -; SI-NEXT: v_readfirstlane_b32 s64, v29 -; SI-NEXT: v_readfirstlane_b32 s65, v30 -; SI-NEXT: v_readfirstlane_b32 s59, v28 +; SI-NEXT: v_readfirstlane_b32 s59, v18 +; SI-NEXT: v_readfirstlane_b32 s64, v30 +; SI-NEXT: v_readfirstlane_b32 s65, v28 +; SI-NEXT: v_readfirstlane_b32 s66, v29 ; SI-NEXT: v_readfirstlane_b32 s60, v27 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s61, v26 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 14 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v43, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v43, s4, 15 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: v_writelane_b32 v43, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 17 ; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: v_readfirstlane_b32 s90, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s40, v17 -; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s7, v38 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v6 ; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v5 ; SI-NEXT: v_readfirstlane_b32 s76, v16 -; SI-NEXT: v_readfirstlane_b32 s77, v15 -; SI-NEXT: v_readfirstlane_b32 s38, v25 +; SI-NEXT: v_readfirstlane_b32 s77, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v3 +; SI-NEXT: v_readfirstlane_b32 s47, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v24 ; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s99, v54 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: v_readfirstlane_b32 s88, v40 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v43, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v43, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v43, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s91, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s8, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s8, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s9, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_writelane_b32 v43, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_writelane_b32 v43, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 27 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_readfirstlane_b32 s91, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s89, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s78, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s7, v48 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s82, v49 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s93, v39 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s96, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s80, v48 +; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s70, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 ; SI-NEXT: v_writelane_b32 v43, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s79, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s9, v35 +; SI-NEXT: v_readfirstlane_b32 s83, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s36, v34 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_readfirstlane_b32 s10, v36 +; SI-NEXT: v_readfirstlane_b32 s10, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s11, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s90, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v49 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s95, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s81, v51 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 38 +; SI-NEXT: v_writelane_b32 v43, s4, 34 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 39 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 40 -; SI-NEXT: v_writelane_b32 v43, s44, 41 -; SI-NEXT: v_writelane_b32 v43, s6, 42 -; SI-NEXT: v_writelane_b32 v43, s7, 43 -; SI-NEXT: v_writelane_b32 v43, s8, 44 -; SI-NEXT: v_writelane_b32 v43, s9, 45 -; SI-NEXT: v_writelane_b32 v43, s10, 46 -; SI-NEXT: v_writelane_b32 v43, s11, 47 -; SI-NEXT: v_writelane_b32 v43, s12, 48 -; SI-NEXT: v_writelane_b32 v43, s13, 49 -; SI-NEXT: v_writelane_b32 v43, s14, 50 -; SI-NEXT: v_writelane_b32 v43, s15, 51 -; SI-NEXT: v_writelane_b32 v43, s18, 52 -; SI-NEXT: v_writelane_b32 v43, s21, 53 -; SI-NEXT: v_writelane_b32 v43, s22, 54 -; SI-NEXT: v_writelane_b32 v43, s40, 55 -; SI-NEXT: v_writelane_b32 v43, s41, 56 -; SI-NEXT: v_writelane_b32 v43, s42, 57 -; SI-NEXT: v_writelane_b32 v43, s43, 58 -; SI-NEXT: v_writelane_b32 v43, s76, 59 -; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v43, s44, 37 +; SI-NEXT: v_writelane_b32 v43, s6, 38 +; SI-NEXT: v_writelane_b32 v43, s83, 39 +; SI-NEXT: v_writelane_b32 v43, s7, 40 +; SI-NEXT: v_writelane_b32 v43, s8, 41 +; SI-NEXT: v_writelane_b32 v43, s36, 42 +; SI-NEXT: v_writelane_b32 v43, s9, 43 +; SI-NEXT: v_writelane_b32 v43, s10, 44 +; SI-NEXT: v_writelane_b32 v43, s11, 45 +; SI-NEXT: v_writelane_b32 v43, s12, 46 +; SI-NEXT: v_writelane_b32 v43, s13, 47 +; SI-NEXT: v_writelane_b32 v43, s14, 48 +; SI-NEXT: v_writelane_b32 v43, s15, 49 +; SI-NEXT: v_writelane_b32 v43, s40, 50 +; SI-NEXT: v_writelane_b32 v43, s41, 51 +; SI-NEXT: v_writelane_b32 v43, s42, 52 +; SI-NEXT: v_writelane_b32 v43, s43, 53 +; SI-NEXT: v_writelane_b32 v43, s76, 54 +; SI-NEXT: v_writelane_b32 v43, s77, 55 +; SI-NEXT: v_writelane_b32 v43, s46, 56 +; SI-NEXT: v_writelane_b32 v43, s47, 57 +; SI-NEXT: v_writelane_b32 v43, s78, 58 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: v_readfirstlane_b32 s28, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s29, v33 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s23, v35 -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: v_readfirstlane_b32 s92, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s26, v36 +; SI-NEXT: v_readfirstlane_b32 s84, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s88, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s79, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s17, v37 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v49 +; SI-NEXT: v_readfirstlane_b32 s94, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s85, v50 +; SI-NEXT: v_readfirstlane_b32 s21, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s66, v51 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: v_readfirstlane_b32 s24, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v51 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v13 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62 -; SI-NEXT: v_writelane_b32 v43, s38, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v14 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v12 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 59 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 60 +; SI-NEXT: v_writelane_b32 v43, s38, 61 +; SI-NEXT: v_writelane_b32 v43, s39, 62 +; SI-NEXT: v_writelane_b32 v43, s48, 63 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s25, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s20, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s19, v32 +; SI-NEXT: v_readfirstlane_b32 s96, v33 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s27, v33 +; SI-NEXT: v_readfirstlane_b32 s35, v35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: v_readfirstlane_b32 s31, v36 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s72, v35 +; SI-NEXT: v_readfirstlane_b32 s72, v37 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s73, v36 +; SI-NEXT: v_readfirstlane_b32 s20, v49 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s67, v37 +; SI-NEXT: v_readfirstlane_b32 s18, v50 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s71, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s97, v39 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s19, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s75, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s67, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s71, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s30, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 @@ -200486,142 +198303,125 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s35, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s87, v50 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s63, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s81, v32 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s80, v33 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s86, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s34, v35 +; SI-NEXT: v_readfirstlane_b32 s73, v31 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: v_readfirstlane_b32 s74, v32 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s31, v37 +; SI-NEXT: v_readfirstlane_b32 s70, v33 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s61, v38 +; SI-NEXT: v_readfirstlane_b32 s69, v34 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_readfirstlane_b32 s53, v50 +; SI-NEXT: v_writelane_b32 v42, s53, 6 +; SI-NEXT: v_writelane_b32 v42, s54, 7 +; SI-NEXT: v_writelane_b32 v42, s58, 8 +; SI-NEXT: v_readfirstlane_b32 s55, v51 +; SI-NEXT: v_writelane_b32 v42, s59, 9 +; SI-NEXT: v_writelane_b32 v42, s55, 10 +; SI-NEXT: v_writelane_b32 v42, s64, 11 +; SI-NEXT: v_writelane_b32 v42, s65, 12 +; SI-NEXT: v_writelane_b32 v42, s66, 13 +; SI-NEXT: v_writelane_b32 v42, s67, 14 +; SI-NEXT: v_writelane_b32 v42, s69, 15 +; SI-NEXT: v_writelane_b32 v42, s70, 16 +; SI-NEXT: v_writelane_b32 v42, s71, 17 +; SI-NEXT: v_writelane_b32 v42, s60, 18 +; SI-NEXT: v_writelane_b32 v42, s61, 19 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s53, v48 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s52, v49 -; SI-NEXT: v_writelane_b32 v42, s52, 7 -; SI-NEXT: v_writelane_b32 v42, s53, 8 -; SI-NEXT: v_writelane_b32 v42, s57, 9 +; SI-NEXT: v_readfirstlane_b32 s85, v36 +; SI-NEXT: v_writelane_b32 v42, s68, 20 +; SI-NEXT: v_writelane_b32 v42, s85, 21 +; SI-NEXT: v_writelane_b32 v42, s30, 22 +; SI-NEXT: v_writelane_b32 v42, s34, 23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s54, v50 -; SI-NEXT: v_writelane_b32 v42, s58, 10 +; SI-NEXT: v_readfirstlane_b32 s37, v38 +; SI-NEXT: v_writelane_b32 v42, s86, 24 +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_writelane_b32 v42, s37, 25 +; SI-NEXT: v_writelane_b32 v42, s87, 26 +; SI-NEXT: v_writelane_b32 v42, s20, 27 +; SI-NEXT: v_writelane_b32 v42, s84, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v51 -; SI-NEXT: v_writelane_b32 v42, s54, 11 -; SI-NEXT: v_writelane_b32 v42, s55, 12 -; SI-NEXT: v_writelane_b32 v42, s64, 13 -; SI-NEXT: v_writelane_b32 v42, s65, 14 -; SI-NEXT: v_writelane_b32 v42, s67, 15 -; SI-NEXT: v_writelane_b32 v42, s71, 16 -; SI-NEXT: v_writelane_b32 v42, s80, 17 -; SI-NEXT: v_writelane_b32 v42, s81, 18 -; SI-NEXT: v_writelane_b32 v42, s59, 19 -; SI-NEXT: v_writelane_b32 v42, s60, 20 -; SI-NEXT: v_writelane_b32 v42, s86, 21 -; SI-NEXT: v_writelane_b32 v42, s97, 22 -; SI-NEXT: v_writelane_b32 v42, s34, 23 -; SI-NEXT: v_writelane_b32 v42, s66, 24 -; SI-NEXT: v_writelane_b32 v42, s85, 25 -; SI-NEXT: v_writelane_b32 v42, s31, 26 -; SI-NEXT: v_writelane_b32 v42, s84, 27 -; SI-NEXT: v_writelane_b32 v42, s35, 28 -; SI-NEXT: v_writelane_b32 v42, s98, 29 -; SI-NEXT: v_writelane_b32 v42, s17, 30 -; SI-NEXT: v_writelane_b32 v42, s20, 31 -; SI-NEXT: v_writelane_b32 v42, s61, 32 -; SI-NEXT: v_writelane_b32 v42, s19, 33 -; SI-NEXT: v_writelane_b32 v42, s62, 34 -; SI-NEXT: v_writelane_b32 v42, s23, 35 -; SI-NEXT: v_writelane_b32 v42, s83, 36 -; SI-NEXT: v_writelane_b32 v42, s87, 37 -; SI-NEXT: v_writelane_b32 v42, s26, 38 -; SI-NEXT: v_writelane_b32 v42, s94, 39 -; SI-NEXT: v_writelane_b32 v42, s27, 40 -; SI-NEXT: v_writelane_b32 v42, s63, 41 -; SI-NEXT: v_writelane_b32 v42, s79, 42 -; SI-NEXT: v_writelane_b32 v42, s88, 43 -; SI-NEXT: v_writelane_b32 v42, s72, 44 -; SI-NEXT: v_writelane_b32 v42, s73, 45 -; SI-NEXT: v_writelane_b32 v42, s74, 46 -; SI-NEXT: v_writelane_b32 v42, s75, 47 -; SI-NEXT: v_writelane_b32 v42, s24, 48 -; SI-NEXT: v_writelane_b32 v42, s25, 49 -; SI-NEXT: v_writelane_b32 v42, s28, 50 +; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_writelane_b32 v42, s92, 29 +; SI-NEXT: v_writelane_b32 v42, s62, 30 +; SI-NEXT: v_readfirstlane_b32 s63, v48 +; SI-NEXT: v_writelane_b32 v42, s23, 31 +; SI-NEXT: v_writelane_b32 v42, s63, 32 +; SI-NEXT: v_writelane_b32 v42, s96, 33 +; SI-NEXT: v_writelane_b32 v42, s17, 34 +; SI-NEXT: v_writelane_b32 v42, s18, 35 +; SI-NEXT: v_writelane_b32 v42, s94, 36 +; SI-NEXT: v_writelane_b32 v42, s19, 37 +; SI-NEXT: v_writelane_b32 v42, s31, 38 +; SI-NEXT: v_writelane_b32 v42, s35, 39 +; SI-NEXT: v_writelane_b32 v42, s24, 40 +; SI-NEXT: v_writelane_b32 v42, s21, 41 +; SI-NEXT: v_writelane_b32 v42, s72, 42 +; SI-NEXT: v_writelane_b32 v42, s73, 43 +; SI-NEXT: v_writelane_b32 v42, s74, 44 +; SI-NEXT: v_writelane_b32 v42, s75, 45 +; SI-NEXT: v_writelane_b32 v42, s25, 46 +; SI-NEXT: v_writelane_b32 v42, s16, 47 +; SI-NEXT: v_writelane_b32 v42, s97, 48 +; SI-NEXT: v_writelane_b32 v42, s28, 49 +; SI-NEXT: v_writelane_b32 v42, s29, 50 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 13 ; SI-NEXT: v_readlane_b32 s5, v43, 12 -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s29, s4, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_writelane_b32 v42, s4, 51 ; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 ; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: s_and_b32 s4, s62, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s58, s4, s5 ; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s59, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_and_b32 s4, s75, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_or_b32 s60, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s61, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s73, s4, s5 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s75, s4, s5 ; SI-NEXT: v_readlane_b32 s4, v43, 9 ; SI-NEXT: v_readlane_b32 s5, v43, 8 @@ -200640,7 +198440,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: v_writelane_b32 v42, s7, 52 ; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_readlane_b32 s6, v43, 1 @@ -200648,348 +198448,351 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s7, s6, s7 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s12, 24 -; SI-NEXT: s_or_b32 s37, s8, s6 +; SI-NEXT: s_lshl_b32 s8, s13, 24 +; SI-NEXT: s_or_b32 s9, s8, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: v_readlane_b32 s8, v43, 2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_writelane_b32 v42, s9, 53 ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s15, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 ; SI-NEXT: s_or_b32 s9, s8, s9 -; SI-NEXT: s_and_b32 s8, s13, 0xff +; SI-NEXT: s_and_b32 s8, s14, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s14, 24 -; SI-NEXT: s_or_b32 s68, s10, s8 -; SI-NEXT: s_and_b32 s8, s21, 0xff +; SI-NEXT: s_lshl_b32 s10, s15, 24 +; SI-NEXT: s_or_b32 s11, s10, s8 +; SI-NEXT: s_and_b32 s8, s42, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s22, 24 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_writelane_b32 v42, s11, 54 ; SI-NEXT: s_or_b32 s8, s10, s8 -; SI-NEXT: s_and_b32 s10, s77, 0xff -; SI-NEXT: s_lshl_b32 s11, s76, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s11, s78, 8 ; SI-NEXT: s_or_b32 s11, s10, s11 -; SI-NEXT: s_and_b32 s10, s40, 0xff +; SI-NEXT: s_and_b32 s10, s76, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, s41, 24 -; SI-NEXT: s_or_b32 s99, s12, s10 -; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s12, s77, 24 +; SI-NEXT: s_or_b32 s13, s12, s10 +; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24 +; SI-NEXT: s_lshl_b32 s12, s38, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v42, s13, 55 ; SI-NEXT: s_or_b32 s10, s12, s10 -; SI-NEXT: s_and_b32 s12, s49, 0xff -; SI-NEXT: s_lshl_b32 s13, s48, 8 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s49, 8 ; SI-NEXT: s_or_b32 s13, s12, s13 -; SI-NEXT: s_and_b32 s12, s38, 0xff +; SI-NEXT: s_and_b32 s12, s39, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 -; SI-NEXT: s_or_b32 s92, s14, s12 -; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s14, s48, 24 +; SI-NEXT: s_or_b32 s27, s14, s12 +; SI-NEXT: s_and_b32 s12, s51, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s51, 24 +; SI-NEXT: s_lshl_b32 s14, s52, 24 ; SI-NEXT: s_or_b32 s12, s14, s12 -; SI-NEXT: s_and_b32 s14, s55, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: s_lshl_b32 s15, s55, 8 ; SI-NEXT: s_or_b32 s15, s14, s15 -; SI-NEXT: s_and_b32 s14, s52, 0xff +; SI-NEXT: s_and_b32 s14, s53, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s24, s53, 24 -; SI-NEXT: s_mov_b32 s28, s90 -; SI-NEXT: s_or_b32 s90, s24, s14 -; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_or_b32 s26, s16, s14 +; SI-NEXT: s_and_b32 s14, s65, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s25, s65, 24 +; SI-NEXT: s_lshl_b32 s25, s66, 24 ; SI-NEXT: s_or_b32 s14, s25, s14 -; SI-NEXT: s_and_b32 s25, s34, 0xff -; SI-NEXT: s_lshl_b32 s40, s86, 8 +; SI-NEXT: s_and_b32 s25, s85, 0xff +; SI-NEXT: s_lshl_b32 s40, s68, 8 ; SI-NEXT: s_or_b32 s41, s25, s40 -; SI-NEXT: s_and_b32 s25, s80, 0xff +; SI-NEXT: s_and_b32 s25, s69, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_lshl_b32 s40, s81, 24 -; SI-NEXT: s_or_b32 s18, s40, s25 -; SI-NEXT: s_and_b32 s40, s31, 0xff +; SI-NEXT: s_lshl_b32 s40, s70, 24 +; SI-NEXT: s_or_b32 s16, s40, s25 +; SI-NEXT: s_and_b32 s40, s37, 0xff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_lshl_b32 s42, s84, 24 +; SI-NEXT: s_lshl_b32 s42, s87, 24 ; SI-NEXT: s_or_b32 s40, s42, s40 -; SI-NEXT: s_and_b32 s42, s35, 0xff -; SI-NEXT: s_lshl_b32 s43, s97, 8 +; SI-NEXT: s_and_b32 s42, s20, 0xff +; SI-NEXT: s_lshl_b32 s43, s30, 8 ; SI-NEXT: s_or_b32 s43, s42, s43 ; SI-NEXT: s_and_b32 s42, s71, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_lshl_b32 s76, s67, 24 -; SI-NEXT: s_or_b32 s35, s76, s42 -; SI-NEXT: s_and_b32 s42, s87, 0xff +; SI-NEXT: s_or_b32 s69, s76, s42 +; SI-NEXT: s_and_b32 s42, s19, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 24 +; SI-NEXT: s_lshl_b32 s76, s18, 24 ; SI-NEXT: s_or_b32 s42, s76, s42 -; SI-NEXT: s_and_b32 s76, s19, 0xff -; SI-NEXT: s_lshl_b32 s77, s20, 8 +; SI-NEXT: s_and_b32 s76, s96, 0xff +; SI-NEXT: s_lshl_b32 s77, s23, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s66, 0xff -; SI-NEXT: v_writelane_b32 v42, s78, 52 +; SI-NEXT: s_and_b32 s77, s86, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s85, 24 -; SI-NEXT: s_or_b32 s19, s78, s77 -; SI-NEXT: s_and_b32 s77, s94, 0xff +; SI-NEXT: s_lshl_b32 s78, s34, 24 +; SI-NEXT: s_or_b32 s70, s78, s77 +; SI-NEXT: s_and_b32 s77, s31, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s27, 24 +; SI-NEXT: s_lshl_b32 s78, s35, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff ; SI-NEXT: s_or_b32 vcc_lo, s78, s77 -; SI-NEXT: s_or_b32 vcc_hi, s76, s19 -; SI-NEXT: s_and_b32 s76, s26, 0xff -; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 vcc_hi, s76, s70 +; SI-NEXT: s_and_b32 s76, s94, 0xff +; SI-NEXT: s_lshl_b32 s77, s17, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s98, 0xff +; SI-NEXT: s_and_b32 s77, s84, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: s_or_b32 s71, s78, s77 -; SI-NEXT: s_and_b32 s77, s79, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 40 +; SI-NEXT: s_lshl_b32 s78, s92, 24 ; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_or_b32 s71, s78, s77 +; SI-NEXT: s_and_b32 s77, s24, 0xff +; SI-NEXT: s_or_b32 s41, s41, s16 +; SI-NEXT: s_mov_b32 s37, s16 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s88, 24 -; SI-NEXT: s_or_b32 s39, s76, s71 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 39 -; SI-NEXT: s_or_b32 s41, s41, s18 -; SI-NEXT: s_mov_b32 s31, s18 +; SI-NEXT: s_lshl_b32 s78, s21, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 36 ; SI-NEXT: s_or_b32 s38, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 38 +; SI-NEXT: s_or_b32 s39, s76, s71 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: s_lshl_b32 s77, s99, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 35 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 37 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 34 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s80, 56 ; SI-NEXT: s_or_b32 s80, s78, s77 -; SI-NEXT: s_and_b32 s77, s95, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 36 +; SI-NEXT: s_and_b32 s77, s97, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s93, 24 -; SI-NEXT: s_or_b32 s49, s76, s80 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 33 ; SI-NEXT: s_or_b32 s48, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 34 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s98, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 32 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 33 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 31 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s81, 57 ; SI-NEXT: s_or_b32 s81, s78, s77 -; SI-NEXT: s_and_b32 s77, s30, 0xff +; SI-NEXT: s_and_b32 s77, s89, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s69, 24 +; SI-NEXT: s_lshl_b32 s78, s90, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 31 +; SI-NEXT: v_readlane_b32 s16, v43, 30 ; SI-NEXT: s_or_b32 s50, s78, s77 ; SI-NEXT: s_or_b32 s51, s76, s81 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 30 +; SI-NEXT: s_and_b32 s76, s79, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 29 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: s_and_b32 s77, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 ; SI-NEXT: s_lshl_b32 s78, s82, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 53 -; SI-NEXT: v_readlane_b32 s18, v43, 32 -; SI-NEXT: v_writelane_b32 v42, s82, 54 +; SI-NEXT: v_writelane_b32 v42, s82, 58 ; SI-NEXT: s_or_b32 s82, s78, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 28 +; SI-NEXT: s_and_b32 s77, s36, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s70, 24 -; SI-NEXT: s_or_b32 s53, s76, s82 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 27 +; SI-NEXT: s_lshl_b32 s78, s83, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 26 ; SI-NEXT: s_or_b32 s52, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 26 +; SI-NEXT: s_or_b32 s53, s76, s82 +; SI-NEXT: s_and_b32 s76, s91, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 25 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 25 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_writelane_b32 v42, s16, 55 -; SI-NEXT: s_or_b32 s16, s78, s77 -; SI-NEXT: s_and_b32 s77, s89, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 29 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 28 +; SI-NEXT: s_or_b32 s83, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 27 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 22 -; SI-NEXT: v_readlane_b32 s18, v43, 21 +; SI-NEXT: v_readlane_b32 s16, v43, 21 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s55, s76, s83 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 20 ; SI-NEXT: s_or_b32 s54, s78, s77 -; SI-NEXT: s_or_b32 s55, s76, s16 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 20 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 19 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 19 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_readlane_b32 s17, v43, 24 -; SI-NEXT: s_or_b32 s83, s78, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 23 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: s_or_b32 s84, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 22 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 17 -; SI-NEXT: v_readlane_b32 s18, v43, 16 +; SI-NEXT: v_readlane_b32 s16, v43, 16 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s65, s76, s84 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 15 ; SI-NEXT: s_or_b32 s64, s78, s77 -; SI-NEXT: s_or_b32 s65, s76, s83 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 15 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 14 +; SI-NEXT: v_writelane_b32 v42, s93, 59 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 14 -; SI-NEXT: v_writelane_b32 v42, s89, 56 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 13 +; SI-NEXT: v_writelane_b32 v42, s90, 60 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: v_writelane_b32 v42, s70, 57 -; SI-NEXT: s_or_b32 s85, s78, s77 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_or_b32 s96, s78, s77 ; SI-NEXT: s_and_b32 s77, s44, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 18 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_readlane_b32 s25, v43, 17 +; SI-NEXT: v_readlane_b32 s16, v42, 51 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 24 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshr_b64 s[16:17], vcc, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_and_b32 s43, s43, 0xffff -; SI-NEXT: v_writelane_b32 v42, s69, 58 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: s_and_b32 s44, s29, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s37 -; SI-NEXT: s_or_b32 s9, s9, s68 -; SI-NEXT: s_or_b32 s11, s11, s99 -; SI-NEXT: s_or_b32 s13, s13, s92 -; SI-NEXT: s_or_b32 s15, s15, s90 -; SI-NEXT: s_or_b32 s43, s43, s35 -; SI-NEXT: v_writelane_b32 v42, s30, 59 -; SI-NEXT: s_mov_b32 s23, s91 -; SI-NEXT: s_mov_b32 s91, s36 ; SI-NEXT: s_or_b32 s66, s78, s77 -; SI-NEXT: s_or_b32 s67, s76, s85 -; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_mov_b32 s77, s22 ; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 s22, s73, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 52 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_or_b32 s15, s15, s26 +; SI-NEXT: s_mov_b32 s93, s88 +; SI-NEXT: s_mov_b32 s88, s98 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s98, s62, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s8 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_or_b32 s8, s22, s54 +; SI-NEXT: s_mov_b32 s22, s77 +; SI-NEXT: s_lshr_b32 s77, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 53 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_mov_b32 s89, s79 +; SI-NEXT: s_mov_b32 s79, s91 +; SI-NEXT: s_mov_b32 s91, s99 +; SI-NEXT: s_or_b32 s67, s76, s96 ; SI-NEXT: s_and_b32 s47, s47, 0xffff ; SI-NEXT: s_and_b32 s56, s56, 0xffff ; SI-NEXT: s_and_b32 s57, s57, 0xffff ; SI-NEXT: s_and_b32 s30, s58, 0xffff -; SI-NEXT: s_and_b32 s34, s59, 0xffff -; SI-NEXT: s_and_b32 s36, s60, 0xffff -; SI-NEXT: s_and_b32 s97, s61, 0xffff -; SI-NEXT: s_and_b32 s86, s62, 0xffff -; SI-NEXT: s_and_b32 s98, s63, 0xffff -; SI-NEXT: s_and_b32 s17, s72, 0xffff -; SI-NEXT: s_and_b32 s87, s73, 0xffff -; SI-NEXT: s_and_b32 s96, s74, 0xffff -; SI-NEXT: s_and_b32 s22, s75, 0xffff +; SI-NEXT: s_and_b32 s86, s61, 0xffff +; SI-NEXT: s_and_b32 s85, s63, 0xffff +; SI-NEXT: s_and_b32 s87, s72, 0xffff +; SI-NEXT: s_and_b32 s68, s74, 0xffff +; SI-NEXT: s_and_b32 s99, s75, 0xffff ; SI-NEXT: s_or_b32 s74, s44, s4 ; SI-NEXT: s_mov_b32 s75, s5 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: s_mov_b32 s70, s93 -; SI-NEXT: s_mov_b32 s69, s95 -; SI-NEXT: s_mov_b32 s93, s28 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 ; SI-NEXT: s_or_b32 s72, s45, s6 ; SI-NEXT: s_mov_b32 s73, s7 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 -; SI-NEXT: s_or_b32 s62, s46, s8 +; SI-NEXT: s_lshr_b64 s[20:21], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 54 +; SI-NEXT: s_or_b32 s43, s43, s69 +; SI-NEXT: s_and_b32 s34, s59, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff ; SI-NEXT: s_mov_b32 s63, s9 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 ; SI-NEXT: s_or_b32 s60, s47, s10 ; SI-NEXT: s_mov_b32 s61, s11 -; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 ; SI-NEXT: s_or_b32 s58, s56, s12 ; SI-NEXT: s_mov_b32 s59, s13 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 ; SI-NEXT: s_or_b32 s56, s57, s14 ; SI-NEXT: s_mov_b32 s57, s15 -; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 ; SI-NEXT: s_or_b32 s46, s30, s40 +; SI-NEXT: s_mov_b32 s94, s6 +; SI-NEXT: s_mov_b32 s92, s4 ; SI-NEXT: s_mov_b32 s47, s41 -; SI-NEXT: s_or_b32 s44, s34, s42 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_mov_b32 s45, s43 -; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 -; SI-NEXT: s_or_b32 s42, s36, vcc_lo -; SI-NEXT: s_mov_b32 s43, vcc_hi -; SI-NEXT: s_lshr_b64 vcc, vcc, 16 -; SI-NEXT: s_or_b32 s40, s97, s38 +; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16 +; SI-NEXT: s_or_b32 s40, s86, s38 ; SI-NEXT: s_mov_b32 s41, s39 ; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 -; SI-NEXT: s_or_b32 s14, s86, s48 +; SI-NEXT: s_or_b32 s14, s98, s48 ; SI-NEXT: s_mov_b32 s15, s49 ; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 -; SI-NEXT: s_or_b32 s12, s98, s50 +; SI-NEXT: s_or_b32 s12, s85, s50 ; SI-NEXT: s_mov_b32 s13, s51 ; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 -; SI-NEXT: s_or_b32 s10, s17, s52 +; SI-NEXT: s_or_b32 s10, s87, s52 ; SI-NEXT: s_mov_b32 s11, s53 ; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 -; SI-NEXT: s_or_b32 s8, s87, s54 ; SI-NEXT: s_mov_b32 s9, s55 ; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 -; SI-NEXT: s_or_b32 s6, s96, s64 +; SI-NEXT: s_or_b32 s6, s68, s64 ; SI-NEXT: s_mov_b32 s7, s65 ; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 -; SI-NEXT: s_or_b32 s4, s22, s66 +; SI-NEXT: s_or_b32 s4, s99, s66 ; SI-NEXT: s_mov_b32 s5, s67 ; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 -; SI-NEXT: v_readlane_b32 s17, v42, 51 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s37, 16 -; SI-NEXT: s_lshr_b32 s51, s68, 16 -; SI-NEXT: s_lshr_b32 s49, s99, 16 -; SI-NEXT: s_lshr_b32 s86, s92, 16 -; SI-NEXT: s_lshr_b32 s39, s90, 16 -; SI-NEXT: s_lshr_b32 s18, s31, 16 -; SI-NEXT: s_lshr_b32 s22, s35, 16 -; SI-NEXT: s_lshr_b32 s97, s19, 16 -; SI-NEXT: s_lshr_b32 s65, s71, 16 -; SI-NEXT: s_lshr_b32 s19, s80, 16 -; SI-NEXT: s_lshr_b32 s71, s81, 16 -; SI-NEXT: s_lshr_b32 s67, s82, 16 -; SI-NEXT: v_readlane_b32 s82, v42, 54 -; SI-NEXT: v_readlane_b32 s96, v42, 53 -; SI-NEXT: s_lshr_b32 s80, s16, 16 -; SI-NEXT: v_readlane_b32 s16, v42, 55 -; SI-NEXT: s_lshr_b32 s81, s83, 16 -; SI-NEXT: s_mov_b32 s90, s93 -; SI-NEXT: v_readlane_b32 s78, v42, 52 -; SI-NEXT: s_mov_b32 s95, s69 -; SI-NEXT: s_mov_b32 s93, s70 -; SI-NEXT: v_readlane_b32 s30, v42, 59 -; SI-NEXT: v_readlane_b32 s69, v42, 58 -; SI-NEXT: v_readlane_b32 s70, v42, 57 -; SI-NEXT: v_readlane_b32 s89, v42, 56 -; SI-NEXT: s_lshr_b32 s77, s85, 16 -; SI-NEXT: s_mov_b32 s84, vcc_lo -; SI-NEXT: s_mov_b32 s36, s91 -; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 55 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_lshr_b32 s67, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s27, 16 +; SI-NEXT: s_lshr_b32 s55, s26, 16 +; SI-NEXT: s_lshr_b32 s36, s37, 16 +; SI-NEXT: s_lshr_b32 s69, s69, 16 +; SI-NEXT: s_lshr_b32 s65, s70, 16 +; SI-NEXT: s_lshr_b32 s71, s71, 16 +; SI-NEXT: s_lshr_b32 s37, s80, 16 +; SI-NEXT: v_readlane_b32 s80, v42, 56 +; SI-NEXT: s_lshr_b32 s39, s81, 16 +; SI-NEXT: v_readlane_b32 s81, v42, 57 +; SI-NEXT: s_lshr_b32 s49, s82, 16 +; SI-NEXT: v_readlane_b32 s82, v42, 58 +; SI-NEXT: s_lshr_b32 s51, s83, 16 +; SI-NEXT: s_mov_b32 s99, s91 +; SI-NEXT: s_mov_b32 s91, s79 +; SI-NEXT: s_mov_b32 s98, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: v_readlane_b32 s90, v42, 60 +; SI-NEXT: s_mov_b32 s88, s93 +; SI-NEXT: v_readlane_b32 s93, v42, 59 +; SI-NEXT: s_lshr_b32 s53, s84, 16 +; SI-NEXT: s_mov_b32 s68, s16 +; SI-NEXT: s_lshr_b32 s70, s96, 16 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: v_readlane_b32 s4, v43, 42 +; SI-NEXT: v_readlane_b32 s4, v43, 40 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 41 +; SI-NEXT: v_readlane_b32 s5, v43, 38 +; SI-NEXT: v_readlane_b32 s6, v43, 37 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v43, 18 +; SI-NEXT: v_readlane_b32 s5, v43, 17 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 @@ -200997,15 +198800,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: v_readlane_b32 s5, v43, 16 ; SI-NEXT: s_add_i32 s5, s5, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s6, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v43, 14 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 14 +; SI-NEXT: v_readlane_b32 s6, v43, 13 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 @@ -201013,14 +198816,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 44 +; SI-NEXT: v_readlane_b32 s6, v43, 43 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s8, v43, 23 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 22 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -201028,15 +198832,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 21 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 21 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s8, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v43, 19 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 19 +; SI-NEXT: v_readlane_b32 s8, v43, 18 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 @@ -201044,13 +198848,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 43 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s8, s80, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 28 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_add_i32 s10, s89, 3 +; SI-NEXT: s_lshl_b32 s9, s93, 8 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v43, 27 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 @@ -201058,15 +198862,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 28 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_readlane_b32 s10, v43, 27 -; SI-NEXT: v_readlane_b32 s11, v43, 26 +; SI-NEXT: s_add_i32 s9, s91, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 26 +; SI-NEXT: v_readlane_b32 s11, v43, 25 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 25 +; SI-NEXT: v_readlane_b32 s10, v43, 24 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 @@ -201074,26 +198877,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 46 +; SI-NEXT: v_readlane_b32 s10, v43, 45 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 45 -; SI-NEXT: v_readlane_b32 s12, v43, 32 +; SI-NEXT: v_readlane_b32 s11, v43, 44 +; SI-NEXT: v_readlane_b32 s12, v43, 42 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 39 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s70, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v43, 31 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 30 +; SI-NEXT: s_add_i32 s11, s79, 3 +; SI-NEXT: v_readlane_b32 s12, v43, 30 +; SI-NEXT: v_readlane_b32 s13, v43, 29 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s12, s96, 8 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s13, s13, 0xff @@ -201103,27 +198907,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_add_i32 s12, s36, 3 +; SI-NEXT: s_add_i32 s12, s81, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s16, 8 -; SI-NEXT: s_add_i32 s14, s30, 3 +; SI-NEXT: s_lshl_b32 s13, s95, 8 +; SI-NEXT: s_add_i32 s14, s89, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s13, s69, 24 +; SI-NEXT: s_lshl_b32 s13, s90, 24 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 36 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s14, v43, 35 -; SI-NEXT: v_readlane_b32 s15, v43, 34 +; SI-NEXT: s_add_i32 s13, s98, 3 +; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s15, v43, 32 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s14, v43, 31 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_lshl_b32 s15, s15, 16 @@ -201132,28 +198935,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_readlane_b32 s14, v42, 50 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_add_i32 s19, s14, 3 ; SI-NEXT: v_readlane_b32 s15, v42, 49 -; SI-NEXT: s_and_b32 s14, s17, 0xff +; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: s_and_b32 s14, s19, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_add_i32 s16, s95, 3 +; SI-NEXT: s_add_i32 s18, s16, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s15, s93, 24 +; SI-NEXT: s_and_b32 s16, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s14, 0x300 ; SI-NEXT: s_or_b32 s15, s15, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v43, 40 +; SI-NEXT: v_readlane_b32 s15, v43, 36 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: v_readlane_b32 s17, v43, 38 +; SI-NEXT: v_readlane_b32 s17, v43, 35 ; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_lshl_b32 s16, s99, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: v_readlane_b32 s16, v43, 34 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 @@ -201161,50 +198964,50 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: v_readlane_b32 s16, v42, 47 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 47 -; SI-NEXT: v_readlane_b32 s18, v42, 42 +; SI-NEXT: v_readlane_b32 s17, v42, 46 +; SI-NEXT: v_readlane_b32 s18, v42, 40 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s99, s18, 3 +; SI-NEXT: s_add_i32 s97, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 43 -; SI-NEXT: s_and_b32 s18, s99, 0xff +; SI-NEXT: v_readlane_b32 s17, v42, 41 +; SI-NEXT: s_and_b32 s18, s97, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 38 -; SI-NEXT: s_add_i32 s87, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 35 -; SI-NEXT: v_readlane_b32 s19, v42, 29 -; SI-NEXT: s_and_b32 s17, s87, 0xff +; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: s_add_i32 s85, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 28 +; SI-NEXT: s_and_b32 s17, s85, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 -; SI-NEXT: s_add_i32 s23, s19, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 30 -; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: v_readlane_b32 s18, v42, 29 +; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_add_i32 s40, s16, 0x3000000 ; SI-NEXT: v_readlane_b32 s16, v42, 45 ; SI-NEXT: s_add_i32 s41, s17, 0x3000000 -; SI-NEXT: s_add_i32 s68, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 44 -; SI-NEXT: v_readlane_b32 s18, v42, 39 -; SI-NEXT: s_and_b32 s16, s68, 0xff +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 42 +; SI-NEXT: v_readlane_b32 s18, v42, 38 +; SI-NEXT: s_and_b32 s16, s23, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: s_add_i32 s87, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 40 -; SI-NEXT: s_and_b32 s18, s96, 0xff +; SI-NEXT: v_readlane_b32 s17, v42, 39 +; SI-NEXT: s_and_b32 s18, s87, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 @@ -201222,22 +199025,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 ; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_readlane_b32 s17, v42, 25 +; SI-NEXT: v_readlane_b32 s17, v42, 23 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s43, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 46 -; SI-NEXT: s_add_i32 s23, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 41 +; SI-NEXT: v_readlane_b32 s16, v42, 44 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 43 ; SI-NEXT: v_readlane_b32 s18, v42, 37 -; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s86, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: v_readlane_b32 s17, v42, 35 ; SI-NEXT: s_and_b32 s18, s86, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201246,15 +199049,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s44, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 28 +; SI-NEXT: v_readlane_b32 s16, v42, 27 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_readlane_b32 s17, v42, 22 -; SI-NEXT: v_readlane_b32 s18, v42, 16 +; SI-NEXT: v_readlane_b32 s18, v42, 17 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: v_readlane_b32 s17, v42, 14 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201263,15 +199066,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s45, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 34 -; SI-NEXT: s_add_i32 s83, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 32 -; SI-NEXT: v_readlane_b32 s18, v42, 26 -; SI-NEXT: s_and_b32 s16, s83, 0xff +; SI-NEXT: v_readlane_b32 s16, v42, 32 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 30 +; SI-NEXT: v_readlane_b32 s18, v42, 25 +; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 27 +; SI-NEXT: v_readlane_b32 s17, v42, 26 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201280,15 +199083,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s46, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 23 +; SI-NEXT: v_readlane_b32 s16, v42, 21 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 21 -; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: v_readlane_b32 s17, v42, 20 +; SI-NEXT: v_readlane_b32 s18, v42, 15 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s17, v42, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201297,15 +199100,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s47, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 20 +; SI-NEXT: v_readlane_b32 s16, v42, 19 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 19 -; SI-NEXT: v_readlane_b32 s18, v42, 13 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s18, v42, 12 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: v_readlane_b32 s17, v42, 13 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201314,15 +199117,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s56, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 12 +; SI-NEXT: v_readlane_b32 s16, v42, 11 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 11 -; SI-NEXT: v_readlane_b32 s18, v42, 7 +; SI-NEXT: v_readlane_b32 s17, v42, 10 +; SI-NEXT: v_readlane_b32 s18, v42, 6 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s17, v42, 7 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201331,15 +199134,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s57, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 10 +; SI-NEXT: v_readlane_b32 s16, v42, 9 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 9 -; SI-NEXT: v_readlane_b32 s18, v42, 5 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 6 +; SI-NEXT: v_readlane_b32 s17, v42, 5 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201348,15 +199151,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s58, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s18, v43, 62 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 63 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201365,15 +199168,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s59, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v42, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 1 -; SI-NEXT: v_readlane_b32 s18, v43, 61 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 60 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 61 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201382,15 +199185,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s60, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 60 +; SI-NEXT: v_readlane_b32 s16, v43, 59 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 59 -; SI-NEXT: v_readlane_b32 s18, v43, 55 +; SI-NEXT: v_readlane_b32 s17, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s17, v43, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201399,15 +199202,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s61, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 58 +; SI-NEXT: v_readlane_b32 s16, v43, 57 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 57 -; SI-NEXT: v_readlane_b32 s18, v43, 53 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s18, v43, 52 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 54 +; SI-NEXT: v_readlane_b32 s17, v43, 53 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201416,15 +199219,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s62, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 52 +; SI-NEXT: v_readlane_b32 s16, v43, 51 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 51 -; SI-NEXT: v_readlane_b32 s18, v43, 49 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s18, v43, 48 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s17, v43, 49 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201453,12 +199256,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s16, v43, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_readlane_b32 s17, v43, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: v_readlane_b32 s18, v43, 46 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 48 +; SI-NEXT: v_readlane_b32 s17, v43, 47 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -201467,8 +199270,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s73, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 13 -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s16, s22, 3 ; SI-NEXT: v_readlane_b32 s17, v43, 12 ; SI-NEXT: v_readlane_b32 s18, v43, 11 ; SI-NEXT: s_and_b32 s16, s16, 0xff @@ -201499,7 +199301,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 @@ -201512,8 +199313,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s14, s14, 0x3000000 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s75, s16, 0x3000000 -; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 @@ -201521,253 +199323,160 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16 -; SI-NEXT: s_lshr_b32 s55, s75, 16 -; SI-NEXT: s_lshr_b32 s53, s73, 16 -; SI-NEXT: s_lshr_b32 s51, s63, 16 -; SI-NEXT: s_lshr_b32 s49, s61, 16 -; SI-NEXT: s_lshr_b32 s86, s59, 16 -; SI-NEXT: s_lshr_b32 s39, s57, 16 -; SI-NEXT: s_lshr_b32 s18, s47, 16 -; SI-NEXT: s_lshr_b32 s22, s45, 16 -; SI-NEXT: s_lshr_b32 s97, s43, 16 -; SI-NEXT: s_lshr_b32 s65, s41, 16 -; SI-NEXT: s_lshr_b32 s19, s15, 16 -; SI-NEXT: s_lshr_b32 s71, s13, 16 -; SI-NEXT: s_lshr_b32 s67, s11, 16 -; SI-NEXT: s_lshr_b32 s80, s9, 16 -; SI-NEXT: s_lshr_b32 s81, s7, 16 -; SI-NEXT: s_lshr_b32 s77, s5, 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s77, s75, 16 +; SI-NEXT: s_lshr_b32 s76, s73, 16 +; SI-NEXT: s_lshr_b32 s78, s63, 16 +; SI-NEXT: s_lshr_b32 s67, s61, 16 +; SI-NEXT: s_lshr_b32 s27, s59, 16 +; SI-NEXT: s_lshr_b32 s55, s57, 16 +; SI-NEXT: s_lshr_b32 s36, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s45, 16 +; SI-NEXT: s_lshr_b32 s65, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s39, s13, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s51, s9, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 16 +; SI-NEXT: s_lshr_b32 s70, s5, 16 ; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s16, s74, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s75, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s72, 0xffff -; SI-NEXT: s_lshl_b32 s17, s26, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s73, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s62, 0xffff -; SI-NEXT: s_lshl_b32 s17, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s63, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s60, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s61, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_lshl_b32 s17, s20, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s59, 0xffff -; SI-NEXT: s_lshl_b32 s17, s86, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s56, 0xffff -; SI-NEXT: s_lshl_b32 s17, s24, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s57, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s46, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s47, 0xffff -; SI-NEXT: s_lshl_b32 s17, s18, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xffff -; SI-NEXT: s_lshl_b32 s17, s22, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s84, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s97, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff -; SI-NEXT: s_lshl_b32 s17, s65, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s75, 0xffff +; SI-NEXT: s_lshl_b32 s19, s77, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s73, 0xffff +; SI-NEXT: s_lshl_b32 s21, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s62, 0xffff +; SI-NEXT: s_lshl_b32 s22, s24, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s63, 0xffff +; SI-NEXT: s_lshl_b32 s23, s78, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s60, 0xffff +; SI-NEXT: s_lshl_b32 s24, s28, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s61, 0xffff +; SI-NEXT: s_lshl_b32 s25, s67, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s58, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s59, 0xffff +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s56, 0xffff +; SI-NEXT: s_lshl_b32 s28, s94, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s57, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_lshl_b32 s46, s30, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_and_b32 s46, s47, 0xffff +; SI-NEXT: s_lshl_b32 s47, s36, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s47, s34, 16 +; SI-NEXT: s_or_b32 s44, s44, s47 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s47, s69, 16 +; SI-NEXT: s_or_b32 s45, s45, s47 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s47, s68, 16 +; SI-NEXT: s_or_b32 s42, s42, s47 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s47, s65, 16 +; SI-NEXT: s_or_b32 s43, s43, s47 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s47, s38, 16 +; SI-NEXT: s_or_b32 s40, s40, s47 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s47, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s47 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s19, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s47, s48, 16 +; SI-NEXT: s_or_b32 s14, s14, s47 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s47, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s47 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s71, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s47, s50, 16 +; SI-NEXT: s_or_b32 s12, s12, s47 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s47, s39, 16 +; SI-NEXT: s_or_b32 s13, s13, s47 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s52, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s67, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s47, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s47 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s47, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s47 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s54, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s80, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s47, s54, 16 +; SI-NEXT: s_or_b32 s8, s8, s47 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s47, s51, 16 +; SI-NEXT: s_or_b32 s9, s9, s47 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s64, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s81, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s47, s64, 16 +; SI-NEXT: s_or_b32 s6, s6, s47 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s47, s53, 16 +; SI-NEXT: s_or_b32 s7, s7, s47 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s66, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s47, s66, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s17, s70, 16 +; SI-NEXT: s_or_b32 s4, s4, s47 +; SI-NEXT: s_or_b32 s5, s5, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: v_mov_b32_e32 v3, s20 +; SI-NEXT: v_mov_b32_e32 v4, s21 +; SI-NEXT: v_mov_b32_e32 v5, s22 +; SI-NEXT: v_mov_b32_e32 v6, s23 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: v_mov_b32_e32 v14, s44 +; SI-NEXT: v_mov_b32_e32 v15, s45 +; SI-NEXT: v_mov_b32_e32 v16, s42 +; SI-NEXT: v_mov_b32_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v18, s40 +; SI-NEXT: v_mov_b32_e32 v19, s41 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 ; SI-NEXT: v_readlane_b32 s99, v41, 35 ; SI-NEXT: v_readlane_b32 s98, v41, 34 ; SI-NEXT: v_readlane_b32 s97, v41, 33 @@ -201805,61 +199514,61 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s31, v41, 1 ; SI-NEXT: v_readlane_b32 s30, v41, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: @@ -205542,2035 +203251,1939 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v58 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v56 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v47 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v46 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v44 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v43 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v41 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v40 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v60 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_or_b32_e32 v41, v1, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v45, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v47, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v44, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v52, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v43, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_or_b32_e32 v35, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v41, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v34, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v33, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v55, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v2, v6 +; SI-NEXT: v_or_b32_e32 v31, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v53, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v51, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v63, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v49, v2, v27 +; SI-NEXT: v_or_b32_e32 v62, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v48, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v39, v2, v19 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v38, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v37, v2, v29 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v36, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v35, v2, v25 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v34, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v2, v11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v31, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v32, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v26, v2, v15 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v30, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v59, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v18, v2, v21 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v58, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v22, v2, v5 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v57, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v14, v2, v17 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v56, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v10, v2, v9 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_or_b32_e32 v46, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v6, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_or_b32_e32 v45, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v42, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v43, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v55, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v40, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v53, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v54, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v50, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v51, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v3, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v48, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v38, v1, v4 +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v60, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v62, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v48, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v38, v2, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v38, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v38, v2, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v10, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v10, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v2, v6, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v2, v6, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v2, v6, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB98_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v8, v9, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 -; SI-NEXT: v_alignbit_b32 v12, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v20, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v47, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v62, v22, v18, 24 -; SI-NEXT: v_alignbit_b32 v63, v22, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x30000, v31 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v33, v29 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v30, v33, v30 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v27, v30, v27 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 -; SI-NEXT: v_alignbit_b32 v56, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v61, v10, v14, 8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v1 -; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v47, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; SI-NEXT: v_alignbit_b32 v1, v32, v33, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: v_alignbit_b32 v1, v63, v31, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v61, v62, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: v_alignbit_b32 v1, v58, v59, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v1, v56, v57, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v55, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v53, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v48, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v38, v39, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v38, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v38, v39, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v63 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: v_alignbit_b32 v4, v45, v46, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -210941,1137 +208554,1064 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v64i16_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_writelane_b32 v40, s30, 0 -; SI-NEXT: v_writelane_b32 v40, s31, 1 -; SI-NEXT: v_writelane_b32 v40, s34, 2 -; SI-NEXT: v_writelane_b32 v40, s35, 3 -; SI-NEXT: v_writelane_b32 v40, s36, 4 -; SI-NEXT: v_writelane_b32 v40, s37, 5 -; SI-NEXT: v_writelane_b32 v40, s38, 6 -; SI-NEXT: v_writelane_b32 v40, s39, 7 -; SI-NEXT: v_writelane_b32 v40, s48, 8 -; SI-NEXT: v_writelane_b32 v40, s49, 9 -; SI-NEXT: v_writelane_b32 v40, s50, 10 -; SI-NEXT: v_writelane_b32 v40, s51, 11 -; SI-NEXT: v_writelane_b32 v40, s52, 12 -; SI-NEXT: v_writelane_b32 v40, s53, 13 -; SI-NEXT: v_writelane_b32 v40, s54, 14 -; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: v_writelane_b32 v40, s64, 16 -; SI-NEXT: v_writelane_b32 v40, s65, 17 -; SI-NEXT: v_writelane_b32 v40, s66, 18 -; SI-NEXT: v_writelane_b32 v40, s67, 19 -; SI-NEXT: v_writelane_b32 v40, s68, 20 -; SI-NEXT: v_writelane_b32 v40, s69, 21 -; SI-NEXT: v_writelane_b32 v40, s70, 22 -; SI-NEXT: s_mov_b32 s88, s17 -; SI-NEXT: v_writelane_b32 v40, s71, 23 -; SI-NEXT: v_writelane_b32 v40, s80, 24 -; SI-NEXT: v_writelane_b32 v40, s81, 25 -; SI-NEXT: v_writelane_b32 v40, s82, 26 -; SI-NEXT: v_writelane_b32 v40, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s7, v15 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v41, s6, 0 -; SI-NEXT: v_readfirstlane_b32 s8, v21 -; SI-NEXT: v_writelane_b32 v41, s7, 1 -; SI-NEXT: v_readfirstlane_b32 s9, v20 -; SI-NEXT: v_writelane_b32 v41, s8, 2 -; SI-NEXT: v_readfirstlane_b32 s10, v19 -; SI-NEXT: v_writelane_b32 v41, s9, 3 -; SI-NEXT: v_readfirstlane_b32 s11, v25 -; SI-NEXT: v_writelane_b32 v41, s10, 4 -; SI-NEXT: v_readfirstlane_b32 s12, v24 -; SI-NEXT: v_writelane_b32 v41, s11, 5 -; SI-NEXT: v_readfirstlane_b32 s13, v23 -; SI-NEXT: v_writelane_b32 v41, s12, 6 -; SI-NEXT: v_readfirstlane_b32 s15, v29 -; SI-NEXT: v_writelane_b32 v41, s13, 7 -; SI-NEXT: v_readfirstlane_b32 s14, v28 -; SI-NEXT: v_writelane_b32 v41, s15, 8 -; SI-NEXT: s_mov_b32 s79, s16 -; SI-NEXT: v_readfirstlane_b32 s16, v27 -; SI-NEXT: v_writelane_b32 v41, s14, 9 -; SI-NEXT: v_writelane_b32 v41, s16, 10 -; SI-NEXT: v_writelane_b32 v40, s84, 28 -; SI-NEXT: v_writelane_b32 v40, s85, 29 -; SI-NEXT: v_writelane_b32 v40, s86, 30 -; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s98, v30 -; SI-NEXT: v_readfirstlane_b32 s97, v26 -; SI-NEXT: v_readfirstlane_b32 s96, v22 -; SI-NEXT: v_readfirstlane_b32 s87, v18 -; SI-NEXT: v_readfirstlane_b32 s81, v17 -; SI-NEXT: v_readfirstlane_b32 s86, v14 -; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s69, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s85, v10 -; SI-NEXT: v_readfirstlane_b32 s51, v9 -; SI-NEXT: v_readfirstlane_b32 s53, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s89, v31 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s91, v32 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s55, v34 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s17, v35 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s95, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s35, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s83, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s65, v7 -; SI-NEXT: v_readfirstlane_b32 s84, v6 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s37, v4 -; SI-NEXT: v_readfirstlane_b32 s49, v3 -; SI-NEXT: v_readfirstlane_b32 s78, v2 -; SI-NEXT: v_readfirstlane_b32 s39, v1 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s77, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s38, v32 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s48, v33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s50, v39 -; SI-NEXT: v_readfirstlane_b32 s90, v35 -; SI-NEXT: v_readfirstlane_b32 s92, v36 -; SI-NEXT: v_writelane_b32 v41, s90, 11 -; SI-NEXT: v_readfirstlane_b32 s94, v37 -; SI-NEXT: v_writelane_b32 v41, s92, 12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: v_writelane_b32 v41, s94, 13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s34, v50 -; SI-NEXT: v_writelane_b32 v41, s30, 14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s36, v51 -; SI-NEXT: v_writelane_b32 v41, s34, 15 -; SI-NEXT: v_writelane_b32 v41, s36, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 -; SI-NEXT: v_writelane_b32 v41, s38, 17 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s99, v34 +; SI-NEXT: v_writelane_b32 v21, s30, 0 +; SI-NEXT: v_writelane_b32 v21, s31, 1 +; SI-NEXT: v_writelane_b32 v21, s34, 2 +; SI-NEXT: v_writelane_b32 v21, s35, 3 +; SI-NEXT: v_writelane_b32 v21, s36, 4 +; SI-NEXT: v_writelane_b32 v21, s37, 5 +; SI-NEXT: v_writelane_b32 v21, s38, 6 +; SI-NEXT: v_writelane_b32 v21, s39, 7 +; SI-NEXT: v_writelane_b32 v21, s48, 8 +; SI-NEXT: v_writelane_b32 v21, s49, 9 +; SI-NEXT: v_writelane_b32 v21, s50, 10 +; SI-NEXT: v_writelane_b32 v21, s51, 11 +; SI-NEXT: v_writelane_b32 v21, s52, 12 +; SI-NEXT: v_writelane_b32 v21, s53, 13 +; SI-NEXT: v_writelane_b32 v21, s54, 14 +; SI-NEXT: v_writelane_b32 v21, s55, 15 +; SI-NEXT: v_writelane_b32 v21, s64, 16 +; SI-NEXT: v_writelane_b32 v21, s65, 17 +; SI-NEXT: v_writelane_b32 v21, s66, 18 +; SI-NEXT: v_writelane_b32 v21, s67, 19 +; SI-NEXT: v_writelane_b32 v21, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v21, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v22, s6, 0 +; SI-NEXT: v_writelane_b32 v21, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_writelane_b32 v22, s7, 1 +; SI-NEXT: v_writelane_b32 v21, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_writelane_b32 v22, s8, 2 +; SI-NEXT: v_writelane_b32 v21, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s48, v14 +; SI-NEXT: v_writelane_b32 v22, s9, 3 +; SI-NEXT: v_writelane_b32 v21, s81, 25 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_writelane_b32 v22, s48, 4 +; SI-NEXT: v_writelane_b32 v21, s82, 26 +; SI-NEXT: v_readfirstlane_b32 s50, v13 +; SI-NEXT: v_writelane_b32 v22, s10, 5 +; SI-NEXT: v_writelane_b32 v21, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s52, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_writelane_b32 v22, s50, 6 +; SI-NEXT: v_writelane_b32 v21, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s11, v20 +; SI-NEXT: v_writelane_b32 v22, s52, 7 +; SI-NEXT: v_writelane_b32 v21, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s54, v15 +; SI-NEXT: v_writelane_b32 v22, s11, 8 +; SI-NEXT: v_writelane_b32 v21, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s64, v18 +; SI-NEXT: v_writelane_b32 v22, s54, 9 +; SI-NEXT: v_writelane_b32 v21, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s66, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_writelane_b32 v22, s64, 10 +; SI-NEXT: v_writelane_b32 v21, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s81, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_writelane_b32 v22, s66, 11 +; SI-NEXT: v_writelane_b32 v21, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s85, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_writelane_b32 v22, s12, 12 +; SI-NEXT: v_writelane_b32 v21, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s87, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s97, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s83, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s71, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_lshr_b32 s92, s29, 16 +; SI-NEXT: s_lshr_b32 s69, s28, 16 +; SI-NEXT: s_lshr_b32 s93, s27, 16 +; SI-NEXT: s_lshr_b32 s67, s26, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s24, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s55, s22, 16 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s53, s20, 16 +; SI-NEXT: s_lshr_b32 s34, s19, 16 +; SI-NEXT: s_lshr_b32 s51, s18, 16 +; SI-NEXT: s_lshr_b32 s35, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_writelane_b32 v22, s13, 13 +; SI-NEXT: v_writelane_b32 v21, s99, 35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_writelane_b32 v41, s48, 18 -; SI-NEXT: v_writelane_b32 v41, s50, 19 +; SI-NEXT: v_readfirstlane_b32 s99, v2 +; SI-NEXT: v_readfirstlane_b32 s76, v18 +; SI-NEXT: v_readfirstlane_b32 s39, v17 +; SI-NEXT: v_readfirstlane_b32 s77, v16 +; SI-NEXT: v_readfirstlane_b32 s38, v15 +; SI-NEXT: v_readfirstlane_b32 s78, v14 +; SI-NEXT: v_readfirstlane_b32 s37, v13 +; SI-NEXT: v_readfirstlane_b32 s79, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s88, v10 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v9 +; SI-NEXT: v_readfirstlane_b32 s89, v8 +; SI-NEXT: v_readfirstlane_b32 s90, v6 +; SI-NEXT: v_readfirstlane_b32 s91, v19 +; SI-NEXT: v_readfirstlane_b32 s94, v3 +; SI-NEXT: v_writelane_b32 v22, s14, 14 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 15 +; SI-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_or_b32 s62, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_or_b32 s63, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s51, 16 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s59, s4, s5 ; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_lshl_b32 s5, s53, 16 ; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 ; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 ; SI-NEXT: s_or_b32 s44, s4, s5 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 ; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 ; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s39, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 ; SI-NEXT: s_or_b32 s75, s4, s5 -; SI-NEXT: s_and_b32 s4, s49, 0xffff -; SI-NEXT: s_lshl_b32 s5, s37, 16 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 ; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s31, 0xffff -; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 ; SI-NEXT: s_or_b32 s73, s4, s5 -; SI-NEXT: s_and_b32 s4, s65, 0xffff -; SI-NEXT: s_lshl_b32 s5, s53, 16 -; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s51, 0xffff -; SI-NEXT: s_lshl_b32 s5, s85, 16 -; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s69, 16 -; SI-NEXT: s_or_b32 s58, s4, s5 -; SI-NEXT: s_and_b32 s4, s67, 0xffff -; SI-NEXT: s_lshl_b32 s5, s86, 16 -; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s61, s4, s5 ; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_lshl_b32 s5, s11, 16 ; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s81, 0xffff -; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 ; SI-NEXT: s_or_b32 s47, s4, s5 ; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_lshl_b32 s5, s12, 16 ; SI-NEXT: s_or_b32 s42, s4, s5 ; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_lshl_b32 s5, s91, 16 ; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 ; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 ; SI-NEXT: s_or_b32 s41, s4, s5 -; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_and_b32 s4, s85, 0xffff ; SI-NEXT: s_lshl_b32 s5, s14, 16 ; SI-NEXT: s_or_b32 s14, s4, s5 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 ; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_and_b32 s4, s94, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 16 ; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s90, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 ; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 ; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s30, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 ; SI-NEXT: s_or_b32 s11, s4, s5 ; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_lshl_b32 s5, s37, 16 ; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s38, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_and_b32 s4, s48, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 ; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s83, 0xffff -; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s55, 0xffff -; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_and_b32 s4, s66, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s91, 0xffff -; SI-NEXT: s_lshl_b32 s16, s89, 16 -; SI-NEXT: s_or_b32 s5, s5, s16 -; SI-NEXT: s_lshr_b32 s16, s61, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s16, 20 -; SI-NEXT: s_lshr_b32 s16, s57, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 23 -; SI-NEXT: s_lshr_b32 s16, s45, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 26 -; SI-NEXT: s_lshr_b32 s16, s75, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 29 -; SI-NEXT: s_lshr_b32 s16, s73, 8 -; SI-NEXT: s_lshr_b64 vcc, s[60:61], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 32 -; SI-NEXT: s_lshr_b32 s16, s63, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 22 -; SI-NEXT: v_writelane_b32 v43, s16, 35 -; SI-NEXT: s_lshr_b32 s16, s59, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 23 -; SI-NEXT: s_lshr_b64 vcc, s[60:61], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 38 -; SI-NEXT: s_lshr_b32 s16, s47, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 20 -; SI-NEXT: v_writelane_b32 v43, s16, 41 -; SI-NEXT: s_lshr_b32 s16, s43, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 21 +; SI-NEXT: s_and_b32 s5, s64, 0xffff +; SI-NEXT: s_lshl_b32 vcc_lo, s76, 16 +; SI-NEXT: s_or_b32 s5, s5, vcc_lo +; SI-NEXT: s_lshr_b32 vcc_lo, s63, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 21 +; SI-NEXT: s_lshr_b32 vcc_lo, s59, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 23 +; SI-NEXT: s_lshr_b32 vcc_lo, s57, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 25 +; SI-NEXT: s_lshr_b32 vcc_lo, s45, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 27 +; SI-NEXT: s_lshr_b32 vcc_lo, s75, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 29 +; SI-NEXT: s_lshr_b32 vcc_lo, s73, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 31 +; SI-NEXT: s_lshr_b32 vcc_lo, s61, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 33 +; SI-NEXT: s_lshr_b32 vcc_lo, s47, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 35 +; SI-NEXT: s_lshr_b32 vcc_lo, s43, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 37 +; SI-NEXT: s_lshr_b32 vcc_lo, s41, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 39 +; SI-NEXT: s_lshr_b32 vcc_lo, s15, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 41 +; SI-NEXT: s_lshr_b32 vcc_lo, s13, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 43 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 45 +; SI-NEXT: s_lshr_b32 vcc_lo, s9, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 47 +; SI-NEXT: s_lshr_b32 vcc_lo, s7, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 49 +; SI-NEXT: s_lshr_b32 vcc_lo, s5, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 51 +; SI-NEXT: s_bfe_u32 vcc_lo, s35, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 20 +; SI-NEXT: s_bfe_u32 vcc_lo, s34, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 22 +; SI-NEXT: s_bfe_u32 vcc_lo, s31, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 24 +; SI-NEXT: s_bfe_u32 vcc_lo, s30, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 26 +; SI-NEXT: s_bfe_u32 vcc_lo, s95, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 28 +; SI-NEXT: s_bfe_u32 vcc_lo, s93, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 30 +; SI-NEXT: s_bfe_u32 vcc_lo, s92, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 32 +; SI-NEXT: s_bfe_u32 vcc_lo, s94, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 34 +; SI-NEXT: s_bfe_u32 vcc_lo, s91, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 36 +; SI-NEXT: s_bfe_u32 vcc_lo, s90, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 38 +; SI-NEXT: s_bfe_u32 vcc_lo, s89, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 40 +; SI-NEXT: s_bfe_u32 vcc_lo, s88, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 42 +; SI-NEXT: s_bfe_u32 vcc_lo, s79, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 44 +; SI-NEXT: s_bfe_u32 vcc_lo, s78, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 46 +; SI-NEXT: s_bfe_u32 vcc_lo, s77, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 48 +; SI-NEXT: s_bfe_u32 vcc_lo, s76, 0x80008 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 50 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 19 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 17 +; SI-NEXT: s_mov_b32 vcc_lo, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[62:63], 8 +; SI-NEXT: s_mov_b32 s51, vcc_lo +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 24 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 24 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 16 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 8 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 21 ; SI-NEXT: s_lshr_b64 vcc, s[56:57], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: s_lshr_b32 s16, s41, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 28 -; SI-NEXT: v_writelane_b32 v43, s16, 47 -; SI-NEXT: s_lshr_b32 s16, s15, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 29 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 31 ; SI-NEXT: s_lshr_b64 vcc, s[56:57], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 50 -; SI-NEXT: s_lshr_b32 s16, s13, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 26 -; SI-NEXT: v_writelane_b32 v43, s16, 53 -; SI-NEXT: s_lshr_b32 s16, s11, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 27 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 28 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 29 ; SI-NEXT: s_lshr_b64 vcc, s[56:57], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 56 -; SI-NEXT: s_lshr_b32 s16, s9, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 59 -; SI-NEXT: s_lshr_b32 s16, s7, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 25 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 26 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 27 +; SI-NEXT: s_mov_b32 vcc_lo, s87 +; SI-NEXT: s_lshr_b64 s[86:87], s[44:45], 24 +; SI-NEXT: s_mov_b32 s87, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s97 +; SI-NEXT: s_lshr_b64 s[96:97], s[44:45], 16 +; SI-NEXT: s_mov_b32 s97, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s99 +; SI-NEXT: s_lshr_b64 s[98:99], s[44:45], 8 +; SI-NEXT: s_mov_b32 s99, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s81 +; SI-NEXT: s_lshr_b64 s[80:81], s[74:75], 24 +; SI-NEXT: s_mov_b32 s81, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s83 +; SI-NEXT: s_lshr_b64 s[82:83], s[74:75], 16 +; SI-NEXT: s_mov_b32 s83, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s85 +; SI-NEXT: s_lshr_b64 s[84:85], s[74:75], 8 +; SI-NEXT: s_mov_b32 s85, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[72:73], 24 +; SI-NEXT: s_mov_b32 s67, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s69 +; SI-NEXT: s_lshr_b64 s[68:69], s[72:73], 16 +; SI-NEXT: s_mov_b32 s69, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s71 +; SI-NEXT: s_lshr_b64 s[70:71], s[72:73], 8 +; SI-NEXT: s_mov_b32 s71, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[60:61], 24 +; SI-NEXT: s_mov_b32 s53, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[60:61], 16 +; SI-NEXT: s_mov_b32 s55, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[60:61], 8 +; SI-NEXT: s_mov_b32 s65, vcc_lo ; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 62 -; SI-NEXT: s_lshr_b32 s16, s5, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 32 -; SI-NEXT: v_writelane_b32 v42, s16, 1 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 33 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 36 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 37 ; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 19 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 30 -; SI-NEXT: v_writelane_b32 v43, s16, 22 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 31 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 34 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 35 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 33 ; SI-NEXT: s_lshr_b64 vcc, s[42:43], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 25 -; SI-NEXT: s_and_b32 s16, s78, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 38 -; SI-NEXT: v_writelane_b32 v43, s16, 28 -; SI-NEXT: s_and_b32 s16, s84, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 39 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 42 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 43 ; SI-NEXT: s_lshr_b64 vcc, s[42:43], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 31 -; SI-NEXT: s_and_b32 s16, s85, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 36 -; SI-NEXT: v_writelane_b32 v43, s16, 34 -; SI-NEXT: s_and_b32 s16, s86, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 37 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 40 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 41 ; SI-NEXT: s_lshr_b64 vcc, s[42:43], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 37 -; SI-NEXT: s_and_b32 s16, s87, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 34 -; SI-NEXT: v_writelane_b32 v43, s16, 40 -; SI-NEXT: s_and_b32 s16, s96, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 35 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 38 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 39 ; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 43 -; SI-NEXT: s_and_b32 s16, s97, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 44 -; SI-NEXT: v_writelane_b32 v43, s16, 46 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 45 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 49 ; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 49 -; SI-NEXT: s_and_b32 s16, s99, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 42 -; SI-NEXT: v_writelane_b32 v43, s16, 52 -; SI-NEXT: s_and_b32 s16, s76, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 43 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 46 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 47 ; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 55 -; SI-NEXT: s_and_b32 s16, s77, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 40 -; SI-NEXT: v_writelane_b32 v43, s16, 58 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 41 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 44 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 45 ; SI-NEXT: s_lshr_b64 vcc, s[14:15], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 61 -; SI-NEXT: s_and_b32 s16, s89, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 50 -; SI-NEXT: v_writelane_b32 v42, s16, 0 -; SI-NEXT: s_bfe_u32 s16, s19, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 51 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 54 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 55 ; SI-NEXT: s_lshr_b64 vcc, s[14:15], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 18 -; SI-NEXT: s_bfe_u32 s16, s23, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v43, s16, 21 -; SI-NEXT: s_bfe_u32 s16, s27, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 49 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 53 ; SI-NEXT: s_lshr_b64 vcc, s[14:15], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 24 -; SI-NEXT: s_bfe_u32 s16, s78, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 46 -; SI-NEXT: v_writelane_b32 v43, s16, 27 -; SI-NEXT: s_bfe_u32 s16, s84, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 47 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 50 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 51 ; SI-NEXT: s_lshr_b64 vcc, s[12:13], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 30 -; SI-NEXT: s_bfe_u32 s16, s85, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, s16, 33 -; SI-NEXT: s_bfe_u32 s16, s86, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 57 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 60 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 61 ; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 36 -; SI-NEXT: s_bfe_u32 s16, s87, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 54 -; SI-NEXT: v_writelane_b32 v43, s16, 39 -; SI-NEXT: s_bfe_u32 s16, s96, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 55 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 59 ; SI-NEXT: s_lshr_b64 vcc, s[12:13], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 42 -; SI-NEXT: s_bfe_u32 s16, s97, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 52 -; SI-NEXT: v_writelane_b32 v43, s16, 45 -; SI-NEXT: s_bfe_u32 s16, s98, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 53 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 57 ; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 48 -; SI-NEXT: s_bfe_u32 s16, s99, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 62 -; SI-NEXT: v_writelane_b32 v43, s16, 51 -; SI-NEXT: s_bfe_u32 s16, s76, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 63 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 3 ; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 54 -; SI-NEXT: s_bfe_u32 s16, s77, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 60 -; SI-NEXT: v_writelane_b32 v43, s16, 57 -; SI-NEXT: s_bfe_u32 s16, s17, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 61 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 1 ; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 60 -; SI-NEXT: s_bfe_u32 s16, s89, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 58 -; SI-NEXT: v_writelane_b32 v43, s16, 63 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 59 +; SI-NEXT: v_writelane_b32 v22, vcc_lo, 62 +; SI-NEXT: v_writelane_b32 v22, vcc_hi, 63 ; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 -; SI-NEXT: s_mov_b32 s16, s93 -; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 4 -; SI-NEXT: s_mov_b32 s93, s16 -; SI-NEXT: s_mov_b32 s16, s71 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 5 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 9 ; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 -; SI-NEXT: s_mov_b32 s71, s16 -; SI-NEXT: s_mov_b32 s16, s81 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 2 -; SI-NEXT: s_mov_b32 s81, s16 -; SI-NEXT: s_mov_b32 s16, s83 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 3 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 7 ; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 -; SI-NEXT: s_mov_b32 s83, s16 -; SI-NEXT: s_mov_b32 s16, s65 -; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 0 -; SI-NEXT: s_mov_b32 s65, s16 -; SI-NEXT: s_mov_b32 s16, s67 -; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 1 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 4 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 5 ; SI-NEXT: s_lshr_b64 vcc, s[6:7], 24 -; SI-NEXT: s_mov_b32 s67, s16 -; SI-NEXT: s_mov_b32 s16, s69 -; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 10 -; SI-NEXT: s_mov_b32 s69, s16 -; SI-NEXT: s_mov_b32 s16, s51 -; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 11 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 15 ; SI-NEXT: s_lshr_b64 vcc, s[6:7], 16 -; SI-NEXT: s_mov_b32 s51, s16 -; SI-NEXT: s_mov_b32 s16, s53 -; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 8 -; SI-NEXT: s_mov_b32 s53, s16 -; SI-NEXT: s_mov_b32 s16, s55 -; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 9 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 13 ; SI-NEXT: s_lshr_b64 vcc, s[6:7], 8 -; SI-NEXT: s_mov_b32 s55, s16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 6 -; SI-NEXT: s_mov_b32 s37, s16 -; SI-NEXT: s_mov_b32 s16, s39 -; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 7 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 10 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 11 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: s_mov_b32 s39, s16 -; SI-NEXT: s_mov_b32 s16, s49 -; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 16 -; SI-NEXT: s_mov_b32 s49, s16 -; SI-NEXT: s_mov_b32 s16, s95 -; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 17 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 -; SI-NEXT: s_mov_b32 s95, s16 -; SI-NEXT: s_mov_b32 s16, s31 -; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 14 -; SI-NEXT: s_mov_b32 s31, s16 -; SI-NEXT: s_mov_b32 s16, s35 -; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 15 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 19 +; SI-NEXT: s_mov_b32 vcc_lo, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 +; SI-NEXT: s_mov_b32 s49, vcc_lo ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: s_mov_b32 s35, s16 -; SI-NEXT: s_mov_b32 s16, s91 -; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 12 -; SI-NEXT: s_mov_b32 s91, s16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 13 +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 17 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s55, 3 +; SI-NEXT: v_readlane_b32 s4, v22, 11 +; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_lshl_b32 s5, s39, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s91, 3 +; SI-NEXT: v_readlane_b32 s5, v22, 10 +; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_lshl_b32 s6, s89, 16 +; SI-NEXT: s_lshl_b32 s6, s76, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s83, 3 +; SI-NEXT: v_readlane_b32 s6, v22, 9 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_lshl_b32 s7, s38, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s7, s95, 3 +; SI-NEXT: v_readlane_b32 s7, v22, 7 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s77, 16 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v41, 19 +; SI-NEXT: v_readlane_b32 s8, v22, 6 ; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: v_readlane_b32 s9, v41, 18 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s9, s37, 16 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 17 +; SI-NEXT: v_readlane_b32 s9, v22, 4 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_lshl_b32 s10, s78, 16 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v41, 16 +; SI-NEXT: v_readlane_b32 s10, v22, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v41, 15 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s11, s36, 16 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v41, 14 +; SI-NEXT: v_readlane_b32 s11, v22, 0 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_lshl_b32 s12, s79, 16 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v41, 13 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v41, 12 +; SI-NEXT: s_add_i32 s12, s97, 3 +; SI-NEXT: v_readlane_b32 s13, v22, 15 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v41, 11 -; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s13, s87, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s14, s99, 16 +; SI-NEXT: s_lshl_b32 s14, s88, 16 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v41, 10 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: v_readlane_b32 s15, v41, 9 +; SI-NEXT: s_add_i32 s14, s85, 3 +; SI-NEXT: v_readlane_b32 s15, v22, 14 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v41, 8 -; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s15, s83, 3 ; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s16, s98, 16 -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v41, 7 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 6 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s40, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 5 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s97, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s41, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 4 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s42, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 2 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s96, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s43, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 1 +; SI-NEXT: s_lshl_b32 s40, s89, 16 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: s_add_i32 s40, s81, 3 +; SI-NEXT: v_readlane_b32 s41, v22, 13 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_add_i32 s41, s71, 3 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s90, 16 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: v_readlane_b32 s42, v22, 5 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: v_readlane_b32 s43, v22, 12 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: v_readlane_b32 s43, v22, 2 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s91, 16 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: v_readlane_b32 s44, v22, 1 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: v_readlane_b32 s45, v22, 8 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_add_i32 s46, s44, 0x30000 +; SI-NEXT: s_add_i32 s44, s99, 3 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s45, s94, 16 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s44, 0x30000 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_lshl_b32 s44, s69, 16 +; SI-NEXT: s_or_b32 s28, s44, s28 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s60, s28, 0x30000 +; SI-NEXT: s_and_b32 s28, s29, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s61, s28, 0x30000 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s28, s67, 16 +; SI-NEXT: s_or_b32 s26, s28, s26 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s72, s26, 0x30000 +; SI-NEXT: s_and_b32 s26, s27, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s73, s26, 0x30000 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s26, s65, 16 +; SI-NEXT: s_or_b32 s24, s26, s24 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s74, s24, 0x30000 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s95, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s75, s24, 0x30000 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s55, 16 +; SI-NEXT: s_or_b32 s22, s24, s22 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s44, s22, 0x30000 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s30, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s45, s22, 0x30000 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s53, 16 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s56, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s31, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s57, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s51, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s58, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s34, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 0 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s46, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s81, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s87, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s47, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s71, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s69, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s58, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s67, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s86, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s59, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s65, 3 +; SI-NEXT: s_add_i32 s59, s18, 0x30000 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s18, s49, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s62, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s51, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s85, 16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s35, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s63, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s49, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s37, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s72, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s31, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s84, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s73, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s28, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s29, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s74, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s39, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s75, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s24, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s25, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s44, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s26, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s27, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s45, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s20, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s21, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s56, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s22, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s23, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s57, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s79, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s60, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s18, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s19, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s61, s16, 0x30000 -; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 22 -; SI-NEXT: v_writelane_b32 v41, s17, 23 -; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 20 -; SI-NEXT: v_writelane_b32 v41, s17, 21 -; SI-NEXT: s_lshr_b32 s16, s61, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 18 -; SI-NEXT: s_lshr_b32 s16, s61, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 19 -; SI-NEXT: s_lshr_b32 s16, s61, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 20 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 24 +; SI-NEXT: v_writelane_b32 v22, s16, 18 +; SI-NEXT: v_writelane_b32 v22, s17, 19 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 16 +; SI-NEXT: v_writelane_b32 v22, s16, 16 +; SI-NEXT: v_writelane_b32 v22, s17, 17 +; SI-NEXT: s_lshr_b32 s16, s63, 24 +; SI-NEXT: v_writelane_b32 v23, s16, 20 +; SI-NEXT: s_lshr_b32 s16, s63, 8 +; SI-NEXT: v_writelane_b32 v23, s16, 21 +; SI-NEXT: s_lshr_b32 s16, s59, 24 +; SI-NEXT: v_writelane_b32 v23, s16, 22 +; SI-NEXT: s_lshr_b32 s16, s59, 8 +; SI-NEXT: v_writelane_b32 v23, s16, 23 ; SI-NEXT: s_lshr_b32 s16, s57, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 21 -; SI-NEXT: s_lshr_b32 s16, s57, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 22 +; SI-NEXT: v_writelane_b32 v23, s16, 24 ; SI-NEXT: s_lshr_b32 s16, s57, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 23 +; SI-NEXT: v_writelane_b32 v23, s16, 25 ; SI-NEXT: s_lshr_b32 s16, s45, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 24 -; SI-NEXT: s_lshr_b32 s16, s45, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 25 +; SI-NEXT: v_writelane_b32 v23, s16, 26 ; SI-NEXT: s_lshr_b32 s16, s45, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 26 +; SI-NEXT: v_writelane_b32 v23, s16, 27 ; SI-NEXT: s_lshr_b32 s16, s75, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 27 -; SI-NEXT: s_lshr_b32 s16, s75, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 28 +; SI-NEXT: v_writelane_b32 v23, s16, 28 ; SI-NEXT: s_lshr_b32 s16, s75, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 29 +; SI-NEXT: v_writelane_b32 v23, s16, 29 ; SI-NEXT: s_lshr_b32 s16, s73, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 30 -; SI-NEXT: s_lshr_b32 s16, s73, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 31 +; SI-NEXT: v_writelane_b32 v23, s16, 30 ; SI-NEXT: s_lshr_b32 s16, s73, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 32 -; SI-NEXT: s_lshr_b32 s16, s63, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 33 -; SI-NEXT: s_lshr_b32 s16, s63, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 34 -; SI-NEXT: s_lshr_b32 s16, s63, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 35 -; SI-NEXT: s_lshr_b32 s16, s59, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 36 -; SI-NEXT: s_lshr_b32 s16, s59, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 37 -; SI-NEXT: s_lshr_b32 s16, s59, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 38 +; SI-NEXT: v_writelane_b32 v23, s16, 31 +; SI-NEXT: s_lshr_b32 s16, s61, 24 +; SI-NEXT: v_writelane_b32 v23, s16, 32 +; SI-NEXT: s_lshr_b32 s16, s61, 8 +; SI-NEXT: v_writelane_b32 v23, s16, 33 ; SI-NEXT: s_lshr_b32 s16, s47, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 39 -; SI-NEXT: s_lshr_b32 s16, s47, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 40 +; SI-NEXT: s_add_i32 s43, s43, 0x30000 +; SI-NEXT: v_writelane_b32 v23, s16, 34 ; SI-NEXT: s_lshr_b32 s16, s47, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 41 +; SI-NEXT: v_writelane_b32 v23, s16, 35 ; SI-NEXT: s_lshr_b32 s16, s43, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 42 -; SI-NEXT: s_lshr_b32 s16, s43, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 43 +; SI-NEXT: s_add_i32 s41, s41, 0x30000 +; SI-NEXT: v_writelane_b32 v23, s16, 36 ; SI-NEXT: s_lshr_b32 s16, s43, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: v_writelane_b32 v23, s16, 37 ; SI-NEXT: s_lshr_b32 s16, s41, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 45 -; SI-NEXT: s_lshr_b32 s16, s41, 16 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 46 +; SI-NEXT: v_writelane_b32 v23, s16, 38 ; SI-NEXT: s_lshr_b32 s16, s41, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 47 +; SI-NEXT: v_writelane_b32 v23, s16, 39 ; SI-NEXT: s_lshr_b32 s16, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 48 -; SI-NEXT: s_lshr_b32 s16, s15, 16 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 49 +; SI-NEXT: v_writelane_b32 v23, s16, 40 ; SI-NEXT: s_lshr_b32 s16, s15, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 50 +; SI-NEXT: v_writelane_b32 v23, s16, 41 ; SI-NEXT: s_lshr_b32 s16, s13, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 51 -; SI-NEXT: s_lshr_b32 s16, s13, 16 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 52 +; SI-NEXT: v_writelane_b32 v23, s16, 42 ; SI-NEXT: s_lshr_b32 s16, s13, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 53 +; SI-NEXT: v_writelane_b32 v23, s16, 43 ; SI-NEXT: s_lshr_b32 s16, s11, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 54 -; SI-NEXT: s_lshr_b32 s16, s11, 16 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 55 +; SI-NEXT: v_writelane_b32 v23, s16, 44 ; SI-NEXT: s_lshr_b32 s16, s11, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 56 +; SI-NEXT: v_writelane_b32 v23, s16, 45 ; SI-NEXT: s_lshr_b32 s16, s9, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 57 -; SI-NEXT: s_lshr_b32 s16, s9, 16 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 58 +; SI-NEXT: v_writelane_b32 v23, s16, 46 ; SI-NEXT: s_lshr_b32 s16, s9, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 59 +; SI-NEXT: v_writelane_b32 v23, s16, 47 ; SI-NEXT: s_lshr_b32 s16, s7, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 60 -; SI-NEXT: s_lshr_b32 s16, s7, 16 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 61 +; SI-NEXT: v_writelane_b32 v23, s16, 48 ; SI-NEXT: s_lshr_b32 s16, s7, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 62 +; SI-NEXT: v_writelane_b32 v23, s16, 49 ; SI-NEXT: s_lshr_b32 s16, s5, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 63 -; SI-NEXT: s_lshr_b32 s16, s5, 16 -; SI-NEXT: v_writelane_b32 v42, s16, 0 +; SI-NEXT: v_writelane_b32 v23, s16, 50 ; SI-NEXT: s_lshr_b32 s16, s5, 8 -; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: v_writelane_b32 v23, s16, 51 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 24 +; SI-NEXT: v_writelane_b32 v22, s16, 24 +; SI-NEXT: v_writelane_b32 v22, s17, 25 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 16 +; SI-NEXT: v_writelane_b32 v22, s16, 22 +; SI-NEXT: v_writelane_b32 v22, s17, 23 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 8 +; SI-NEXT: v_writelane_b32 v22, s16, 20 +; SI-NEXT: v_writelane_b32 v22, s17, 21 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 28 -; SI-NEXT: v_writelane_b32 v41, s17, 29 +; SI-NEXT: v_writelane_b32 v22, s16, 30 +; SI-NEXT: v_writelane_b32 v22, s17, 31 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 26 -; SI-NEXT: v_writelane_b32 v41, s17, 27 +; SI-NEXT: v_writelane_b32 v22, s16, 28 +; SI-NEXT: v_writelane_b32 v22, s17, 29 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 8 -; SI-NEXT: v_writelane_b32 v41, s16, 24 -; SI-NEXT: v_writelane_b32 v41, s17, 25 +; SI-NEXT: v_writelane_b32 v22, s16, 26 +; SI-NEXT: v_writelane_b32 v22, s17, 27 ; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 32 -; SI-NEXT: v_writelane_b32 v41, s17, 33 +; SI-NEXT: v_writelane_b32 v22, s16, 36 +; SI-NEXT: v_writelane_b32 v22, s17, 37 ; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 30 -; SI-NEXT: v_writelane_b32 v41, s17, 31 +; SI-NEXT: v_writelane_b32 v22, s16, 34 +; SI-NEXT: v_writelane_b32 v22, s17, 35 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 8 +; SI-NEXT: s_add_i32 s42, s42, 0x30000 +; SI-NEXT: v_writelane_b32 v22, s16, 32 +; SI-NEXT: v_writelane_b32 v22, s17, 33 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 38 -; SI-NEXT: v_writelane_b32 v41, s17, 39 +; SI-NEXT: v_writelane_b32 v22, s16, 42 +; SI-NEXT: v_writelane_b32 v22, s17, 43 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 36 -; SI-NEXT: v_writelane_b32 v41, s17, 37 +; SI-NEXT: v_writelane_b32 v22, s16, 40 +; SI-NEXT: v_writelane_b32 v22, s17, 41 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: v_writelane_b32 v41, s17, 35 +; SI-NEXT: s_add_i32 s40, s40, 0x30000 +; SI-NEXT: v_writelane_b32 v22, s16, 38 +; SI-NEXT: v_writelane_b32 v22, s17, 39 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 44 -; SI-NEXT: v_writelane_b32 v41, s17, 45 +; SI-NEXT: v_writelane_b32 v22, s16, 48 +; SI-NEXT: v_writelane_b32 v22, s17, 49 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 42 -; SI-NEXT: v_writelane_b32 v41, s17, 43 +; SI-NEXT: v_writelane_b32 v22, s16, 46 +; SI-NEXT: v_writelane_b32 v22, s17, 47 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 ; SI-NEXT: s_add_i32 s14, s14, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 40 -; SI-NEXT: v_writelane_b32 v41, s17, 41 +; SI-NEXT: v_writelane_b32 v22, s16, 44 +; SI-NEXT: v_writelane_b32 v22, s17, 45 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 50 -; SI-NEXT: v_writelane_b32 v41, s17, 51 +; SI-NEXT: v_writelane_b32 v22, s16, 54 +; SI-NEXT: v_writelane_b32 v22, s17, 55 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 48 -; SI-NEXT: v_writelane_b32 v41, s17, 49 +; SI-NEXT: v_writelane_b32 v22, s16, 52 +; SI-NEXT: v_writelane_b32 v22, s17, 53 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 ; SI-NEXT: s_add_i32 s12, s12, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 46 -; SI-NEXT: v_writelane_b32 v41, s17, 47 +; SI-NEXT: v_writelane_b32 v22, s16, 50 +; SI-NEXT: v_writelane_b32 v22, s17, 51 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 56 -; SI-NEXT: v_writelane_b32 v41, s17, 57 +; SI-NEXT: v_writelane_b32 v22, s16, 60 +; SI-NEXT: v_writelane_b32 v22, s17, 61 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 54 -; SI-NEXT: v_writelane_b32 v41, s17, 55 +; SI-NEXT: v_writelane_b32 v22, s16, 58 +; SI-NEXT: v_writelane_b32 v22, s17, 59 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 8 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 52 -; SI-NEXT: v_writelane_b32 v41, s17, 53 +; SI-NEXT: v_writelane_b32 v22, s16, 56 +; SI-NEXT: v_writelane_b32 v22, s17, 57 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 62 -; SI-NEXT: v_writelane_b32 v41, s17, 63 +; SI-NEXT: v_writelane_b32 v23, s16, 2 +; SI-NEXT: v_writelane_b32 v23, s17, 3 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 60 -; SI-NEXT: v_writelane_b32 v41, s17, 61 +; SI-NEXT: v_writelane_b32 v23, s16, 0 +; SI-NEXT: v_writelane_b32 v23, s17, 1 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 58 -; SI-NEXT: v_writelane_b32 v41, s17, 59 +; SI-NEXT: v_writelane_b32 v22, s16, 62 +; SI-NEXT: v_writelane_b32 v22, s17, 63 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 4 -; SI-NEXT: v_writelane_b32 v43, s17, 5 +; SI-NEXT: v_writelane_b32 v23, s16, 8 +; SI-NEXT: v_writelane_b32 v23, s17, 9 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 2 -; SI-NEXT: v_writelane_b32 v43, s17, 3 +; SI-NEXT: v_writelane_b32 v23, s16, 6 +; SI-NEXT: v_writelane_b32 v23, s17, 7 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 8 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 0 -; SI-NEXT: v_writelane_b32 v43, s17, 1 +; SI-NEXT: v_writelane_b32 v23, s16, 4 +; SI-NEXT: v_writelane_b32 v23, s17, 5 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 10 -; SI-NEXT: v_writelane_b32 v43, s17, 11 +; SI-NEXT: v_writelane_b32 v23, s16, 14 +; SI-NEXT: v_writelane_b32 v23, s17, 15 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 8 -; SI-NEXT: v_writelane_b32 v43, s17, 9 +; SI-NEXT: v_writelane_b32 v23, s16, 12 +; SI-NEXT: v_writelane_b32 v23, s17, 13 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 8 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 6 -; SI-NEXT: v_writelane_b32 v43, s17, 7 +; SI-NEXT: v_writelane_b32 v23, s16, 10 +; SI-NEXT: v_writelane_b32 v23, s17, 11 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 17 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 14 -; SI-NEXT: v_writelane_b32 v43, s17, 15 +; SI-NEXT: v_writelane_b32 v23, s16, 18 +; SI-NEXT: v_writelane_b32 v23, s17, 19 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 -; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 -; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 -; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 12 -; SI-NEXT: v_writelane_b32 v43, s17, 13 +; SI-NEXT: s_lshr_b32 s35, s63, 16 +; SI-NEXT: s_lshr_b32 s34, s59, 16 +; SI-NEXT: s_lshr_b32 s31, s57, 16 +; SI-NEXT: s_lshr_b32 s30, s45, 16 +; SI-NEXT: s_lshr_b32 s95, s75, 16 +; SI-NEXT: s_lshr_b32 s93, s73, 16 +; SI-NEXT: s_lshr_b32 s92, s61, 16 +; SI-NEXT: s_lshr_b32 s94, s47, 16 +; SI-NEXT: s_lshr_b32 s91, s43, 16 +; SI-NEXT: s_lshr_b32 s90, s41, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[66:67], s[72:73], 24 +; SI-NEXT: s_lshr_b64 s[68:69], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[70:71], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[52:53], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s16, 16 +; SI-NEXT: v_writelane_b32 v23, s17, 17 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: s_lshl_b32 s17, s92, 8 -; SI-NEXT: s_and_b32 s18, s60, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 16 +; SI-NEXT: s_lshl_b32 s16, s50, 8 +; SI-NEXT: s_and_b32 s17, s62, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 17 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 18 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 20 -; SI-NEXT: v_readlane_b32 s19, v41, 21 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 22 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 20 -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b32 s17, s61, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 19 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 18 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 24 -; SI-NEXT: v_readlane_b32 s17, v41, 25 -; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: s_and_b32 s18, s56, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 23 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 21 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s63, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s35, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 20 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 26 -; SI-NEXT: v_readlane_b32 s19, v41, 27 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 28 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 23 -; SI-NEXT: v_mov_b32_e32 v3, s17 -; SI-NEXT: s_and_b32 s17, s57, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 22 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 21 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: s_lshl_b32 s17, s82, 8 -; SI-NEXT: s_and_b32 s18, s44, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s19, v22, 19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 20 +; SI-NEXT: v_readlane_b32 s17, v22, 21 +; SI-NEXT: v_readlane_b32 s18, v22, 22 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s58, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 23 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s80, 0xff -; SI-NEXT: s_lshl_b32 s19, s70, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 26 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: s_and_b32 s17, s45, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 25 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v6, s17 -; SI-NEXT: s_lshl_b32 s17, s68, 8 -; SI-NEXT: s_and_b32 s18, s74, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 23 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s59, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s34, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 22 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s66, 0xff -; SI-NEXT: s_lshl_b32 s19, s64, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 29 -; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: s_and_b32 s17, s75, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 28 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 27 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v8, s17 -; SI-NEXT: s_lshl_b32 s17, s54, 8 -; SI-NEXT: s_and_b32 s18, s72, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s19, v22, 25 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 26 +; SI-NEXT: v_readlane_b32 s17, v22, 27 +; SI-NEXT: v_readlane_b32 s18, v22, 28 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s56, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 29 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 30 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s52, 0xff -; SI-NEXT: s_lshl_b32 s19, s50, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 32 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: s_and_b32 s17, s73, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 31 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 30 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v10, s17 -; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: s_and_b32 s18, s62, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 25 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s57, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s31, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s38, 0xff -; SI-NEXT: s_lshl_b32 s19, s36, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 35 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: s_and_b32 s17, s63, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 34 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 33 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v12, s17 -; SI-NEXT: s_lshl_b32 s17, s34, 8 -; SI-NEXT: s_and_b32 s18, s58, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: s_lshl_b32 s16, s98, 8 +; SI-NEXT: s_and_b32 s17, s44, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s96, 0xff +; SI-NEXT: s_lshl_b32 s18, s86, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s30, 0xff -; SI-NEXT: s_lshl_b32 s19, s94, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 38 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: s_and_b32 s17, s59, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 37 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 36 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: s_lshl_b32 s17, s90, 8 -; SI-NEXT: s_and_b32 s18, s46, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 29 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 27 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 26 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 30 -; SI-NEXT: v_readlane_b32 s19, v41, 31 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 32 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 41 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: s_and_b32 s17, s47, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 40 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: s_lshl_b32 s16, s84, 8 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s82, 0xff +; SI-NEXT: s_lshl_b32 s18, s80, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 29 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s75, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s95, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 28 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_lshl_b32 s16, s70, 8 +; SI-NEXT: s_and_b32 s17, s72, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s68, 0xff +; SI-NEXT: s_lshl_b32 s18, s66, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 31 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s73, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s93, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 30 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: s_lshl_b32 s16, s64, 8 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s54, 0xff +; SI-NEXT: s_lshl_b32 s18, s52, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 33 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s16, s61, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s92, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 32 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s19, v22, 31 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 32 +; SI-NEXT: v_readlane_b32 s17, v22, 33 +; SI-NEXT: v_readlane_b32 s18, v22, 34 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 35 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 36 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v23, 35 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: s_and_b32 s16, s47, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s94, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 34 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: v_readlane_b32 s17, v41, 35 -; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: v_readlane_b32 s19, v22, 37 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 38 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: v_readlane_b32 s17, v22, 39 +; SI-NEXT: v_readlane_b32 s18, v22, 40 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 38 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 41 ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 42 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v23, 37 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s91, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 36 +; SI-NEXT: v_readlane_b32 s19, v22, 43 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_lshl_b32 s17, s17, 16 @@ -212079,40 +209619,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: v_readlane_b32 s18, v22, 44 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: v_readlane_b32 s19, v22, 45 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s18, v22, 46 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: v_readlane_b32 s19, v22, 47 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 44 +; SI-NEXT: v_readlane_b32 s18, v22, 48 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 47 +; SI-NEXT: v_readlane_b32 s17, v23, 39 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s41, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 46 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 45 +; SI-NEXT: s_and_b32 s17, s90, 0xff +; SI-NEXT: v_readlane_b32 s18, v23, 38 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -212122,16 +209661,16 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v41, 46 +; SI-NEXT: v_readlane_b32 s16, v22, 50 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v41, 47 +; SI-NEXT: v_readlane_b32 s17, v22, 51 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v41, 45 +; SI-NEXT: v_readlane_b32 s19, v22, 49 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v41, 48 -; SI-NEXT: v_readlane_b32 s17, v41, 49 +; SI-NEXT: v_readlane_b32 s16, v22, 52 +; SI-NEXT: v_readlane_b32 s17, v22, 53 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 50 +; SI-NEXT: v_readlane_b32 s18, v22, 54 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -212142,12 +209681,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v43, 50 +; SI-NEXT: v_readlane_b32 s15, v23, 41 ; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v43, 49 -; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 48 +; SI-NEXT: s_and_b32 s15, s89, 0xff +; SI-NEXT: v_readlane_b32 s16, v23, 40 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -212157,15 +209695,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v41, 52 +; SI-NEXT: v_readlane_b32 s14, v22, 56 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v41, 53 +; SI-NEXT: v_readlane_b32 s15, v22, 57 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v41, 54 -; SI-NEXT: v_readlane_b32 s15, v41, 55 +; SI-NEXT: v_readlane_b32 s14, v22, 58 +; SI-NEXT: v_readlane_b32 s15, v22, 59 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v41, 56 +; SI-NEXT: v_readlane_b32 s16, v22, 60 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -212176,12 +209714,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v43, 53 +; SI-NEXT: v_readlane_b32 s13, v23, 43 ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v43, 52 -; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v43, 51 +; SI-NEXT: s_and_b32 s13, s88, 0xff +; SI-NEXT: v_readlane_b32 s14, v23, 42 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -212191,15 +209728,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v41, 58 +; SI-NEXT: v_readlane_b32 s12, v22, 62 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v41, 59 +; SI-NEXT: v_readlane_b32 s13, v22, 63 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v41, 60 -; SI-NEXT: v_readlane_b32 s13, v41, 61 +; SI-NEXT: v_readlane_b32 s12, v23, 0 +; SI-NEXT: v_readlane_b32 s13, v23, 1 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v41, 62 +; SI-NEXT: v_readlane_b32 s14, v23, 2 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -212210,12 +209747,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 56 +; SI-NEXT: v_readlane_b32 s11, v23, 45 ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v43, 55 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v43, 54 +; SI-NEXT: s_and_b32 s11, s79, 0xff +; SI-NEXT: v_readlane_b32 s12, v23, 44 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -212225,15 +209761,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v43, 0 +; SI-NEXT: v_readlane_b32 s10, v23, 4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 1 +; SI-NEXT: v_readlane_b32 s11, v23, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v43, 2 -; SI-NEXT: v_readlane_b32 s11, v43, 3 +; SI-NEXT: v_readlane_b32 s10, v23, 6 +; SI-NEXT: v_readlane_b32 s11, v23, 7 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: v_readlane_b32 s12, v23, 8 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -212244,12 +209780,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 59 +; SI-NEXT: v_readlane_b32 s9, v23, 47 ; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 58 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: v_readlane_b32 s10, v43, 57 +; SI-NEXT: s_and_b32 s9, s78, 0xff +; SI-NEXT: v_readlane_b32 s10, v23, 46 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -212259,15 +209794,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v43, 6 +; SI-NEXT: v_readlane_b32 s8, v23, 10 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 7 +; SI-NEXT: v_readlane_b32 s9, v23, 11 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v43, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 9 +; SI-NEXT: v_readlane_b32 s8, v23, 12 +; SI-NEXT: v_readlane_b32 s9, v23, 13 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v43, 10 +; SI-NEXT: v_readlane_b32 s10, v23, 14 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -212278,12 +209813,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 62 +; SI-NEXT: v_readlane_b32 s7, v23, 49 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v43, 61 -; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 60 +; SI-NEXT: s_and_b32 s7, s77, 0xff +; SI-NEXT: v_readlane_b32 s8, v23, 48 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -212293,15 +209827,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 12 +; SI-NEXT: v_readlane_b32 s6, v23, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v23, 17 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 14 -; SI-NEXT: v_readlane_b32 s7, v43, 15 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 16 +; SI-NEXT: s_and_b32 s6, s48, 0xff +; SI-NEXT: v_readlane_b32 s8, v23, 18 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -212312,12 +209844,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v42, 1 +; SI-NEXT: v_readlane_b32 s5, v23, 51 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v42, 0 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v43, 63 +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_readlane_b32 s6, v23, 50 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -212327,89 +209858,83 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s21, v41, 33 -; SI-NEXT: v_readlane_b32 s19, v41, 51 -; SI-NEXT: v_readlane_b32 s17, v41, 57 -; SI-NEXT: v_readlane_b32 s15, v41, 63 -; SI-NEXT: v_readlane_b32 s13, v43, 5 -; SI-NEXT: v_readlane_b32 s11, v43, 11 -; SI-NEXT: v_readlane_b32 s9, v43, 17 +; SI-NEXT: v_readlane_b32 s19, v22, 55 +; SI-NEXT: v_readlane_b32 s17, v22, 61 +; SI-NEXT: v_readlane_b32 s15, v23, 3 +; SI-NEXT: v_readlane_b32 s13, v23, 9 +; SI-NEXT: v_readlane_b32 s11, v23, 15 +; SI-NEXT: v_readlane_b32 s9, v23, 19 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v40, 35 -; SI-NEXT: v_readlane_b32 s98, v40, 34 -; SI-NEXT: v_readlane_b32 s97, v40, 33 -; SI-NEXT: v_readlane_b32 s96, v40, 32 -; SI-NEXT: v_readlane_b32 s87, v40, 31 -; SI-NEXT: v_readlane_b32 s86, v40, 30 -; SI-NEXT: v_readlane_b32 s85, v40, 29 -; SI-NEXT: v_readlane_b32 s84, v40, 28 -; SI-NEXT: v_readlane_b32 s83, v40, 27 -; SI-NEXT: v_readlane_b32 s82, v40, 26 -; SI-NEXT: v_readlane_b32 s81, v40, 25 -; SI-NEXT: v_readlane_b32 s80, v40, 24 -; SI-NEXT: v_readlane_b32 s71, v40, 23 -; SI-NEXT: v_readlane_b32 s70, v40, 22 -; SI-NEXT: v_readlane_b32 s69, v40, 21 -; SI-NEXT: v_readlane_b32 s68, v40, 20 -; SI-NEXT: v_readlane_b32 s67, v40, 19 -; SI-NEXT: v_readlane_b32 s66, v40, 18 -; SI-NEXT: v_readlane_b32 s65, v40, 17 -; SI-NEXT: v_readlane_b32 s64, v40, 16 -; SI-NEXT: v_readlane_b32 s55, v40, 15 -; SI-NEXT: v_readlane_b32 s54, v40, 14 -; SI-NEXT: v_readlane_b32 s53, v40, 13 -; SI-NEXT: v_readlane_b32 s52, v40, 12 -; SI-NEXT: v_readlane_b32 s51, v40, 11 -; SI-NEXT: v_readlane_b32 s50, v40, 10 -; SI-NEXT: v_readlane_b32 s49, v40, 9 -; SI-NEXT: v_readlane_b32 s48, v40, 8 -; SI-NEXT: v_readlane_b32 s39, v40, 7 -; SI-NEXT: v_readlane_b32 s38, v40, 6 -; SI-NEXT: v_readlane_b32 s37, v40, 5 -; SI-NEXT: v_readlane_b32 s36, v40, 4 -; SI-NEXT: v_readlane_b32 s35, v40, 3 -; SI-NEXT: v_readlane_b32 s34, v40, 2 -; SI-NEXT: v_readlane_b32 s31, v40, 1 -; SI-NEXT: v_readlane_b32 s30, v40, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v21, 35 +; SI-NEXT: v_readlane_b32 s98, v21, 34 +; SI-NEXT: v_readlane_b32 s97, v21, 33 +; SI-NEXT: v_readlane_b32 s96, v21, 32 +; SI-NEXT: v_readlane_b32 s87, v21, 31 +; SI-NEXT: v_readlane_b32 s86, v21, 30 +; SI-NEXT: v_readlane_b32 s85, v21, 29 +; SI-NEXT: v_readlane_b32 s84, v21, 28 +; SI-NEXT: v_readlane_b32 s83, v21, 27 +; SI-NEXT: v_readlane_b32 s82, v21, 26 +; SI-NEXT: v_readlane_b32 s81, v21, 25 +; SI-NEXT: v_readlane_b32 s80, v21, 24 +; SI-NEXT: v_readlane_b32 s71, v21, 23 +; SI-NEXT: v_readlane_b32 s70, v21, 22 +; SI-NEXT: v_readlane_b32 s69, v21, 21 +; SI-NEXT: v_readlane_b32 s68, v21, 20 +; SI-NEXT: v_readlane_b32 s67, v21, 19 +; SI-NEXT: v_readlane_b32 s66, v21, 18 +; SI-NEXT: v_readlane_b32 s65, v21, 17 +; SI-NEXT: v_readlane_b32 s64, v21, 16 +; SI-NEXT: v_readlane_b32 s55, v21, 15 +; SI-NEXT: v_readlane_b32 s54, v21, 14 +; SI-NEXT: v_readlane_b32 s53, v21, 13 +; SI-NEXT: v_readlane_b32 s52, v21, 12 +; SI-NEXT: v_readlane_b32 s51, v21, 11 +; SI-NEXT: v_readlane_b32 s50, v21, 10 +; SI-NEXT: v_readlane_b32 s49, v21, 9 +; SI-NEXT: v_readlane_b32 s48, v21, 8 +; SI-NEXT: v_readlane_b32 s39, v21, 7 +; SI-NEXT: v_readlane_b32 s38, v21, 6 +; SI-NEXT: v_readlane_b32 s37, v21, 5 +; SI-NEXT: v_readlane_b32 s36, v21, 4 +; SI-NEXT: v_readlane_b32 s35, v21, 3 +; SI-NEXT: v_readlane_b32 s34, v21, 2 +; SI-NEXT: v_readlane_b32 s31, v21, 1 +; SI-NEXT: v_readlane_b32 s30, v21, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: v_writelane_b32 v41, s4, 20 -; SI-NEXT: v_writelane_b32 v41, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v22, s4, 16 +; SI-NEXT: v_writelane_b32 v22, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr14 @@ -212417,33 +209942,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: v_writelane_b32 v41, s4, 22 -; SI-NEXT: v_writelane_b32 v41, s5, 23 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 18 +; SI-NEXT: v_writelane_b32 v22, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 24 -; SI-NEXT: v_writelane_b32 v41, s5, 25 +; SI-NEXT: v_writelane_b32 v22, s4, 20 +; SI-NEXT: v_writelane_b32 v22, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: v_writelane_b32 v41, s5, 27 +; SI-NEXT: v_writelane_b32 v22, s4, 22 +; SI-NEXT: v_writelane_b32 v22, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: v_writelane_b32 v41, s5, 29 +; SI-NEXT: v_writelane_b32 v22, s4, 24 +; SI-NEXT: v_writelane_b32 v22, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 26 +; SI-NEXT: v_writelane_b32 v22, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 28 +; SI-NEXT: v_writelane_b32 v22, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 30 +; SI-NEXT: v_writelane_b32 v22, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -212501,108 +210027,85 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 32 +; SI-NEXT: v_writelane_b32 v22, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 34 +; SI-NEXT: v_writelane_b32 v22, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 36 +; SI-NEXT: v_writelane_b32 v22, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 38 +; SI-NEXT: v_writelane_b32 v22, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 40 +; SI-NEXT: v_writelane_b32 v22, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 42 +; SI-NEXT: v_writelane_b32 v22, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 44 +; SI-NEXT: v_writelane_b32 v22, s5, 45 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 46 +; SI-NEXT: v_writelane_b32 v22, s5, 47 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v22, s4, 48 +; SI-NEXT: v_writelane_b32 v22, s5, 49 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: v_writelane_b32 v41, s5, 31 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: v_writelane_b32 v41, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: v_writelane_b32 v41, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: v_writelane_b32 v41, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 38 -; SI-NEXT: v_writelane_b32 v41, s5, 39 +; SI-NEXT: v_writelane_b32 v22, s4, 50 +; SI-NEXT: v_writelane_b32 v22, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 40 -; SI-NEXT: v_writelane_b32 v41, s5, 41 +; SI-NEXT: v_writelane_b32 v22, s4, 52 +; SI-NEXT: v_writelane_b32 v22, s5, 53 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 42 -; SI-NEXT: v_writelane_b32 v41, s5, 43 +; SI-NEXT: v_writelane_b32 v22, s4, 54 +; SI-NEXT: v_writelane_b32 v22, s5, 55 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 44 -; SI-NEXT: v_writelane_b32 v41, s5, 45 +; SI-NEXT: v_writelane_b32 v22, s4, 56 +; SI-NEXT: v_writelane_b32 v22, s5, 57 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 46 -; SI-NEXT: v_writelane_b32 v41, s5, 47 +; SI-NEXT: v_writelane_b32 v22, s4, 58 +; SI-NEXT: v_writelane_b32 v22, s5, 59 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 48 -; SI-NEXT: v_writelane_b32 v41, s5, 49 +; SI-NEXT: v_writelane_b32 v22, s4, 60 +; SI-NEXT: v_writelane_b32 v22, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 50 -; SI-NEXT: v_writelane_b32 v41, s5, 51 +; SI-NEXT: v_writelane_b32 v22, s4, 62 +; SI-NEXT: v_writelane_b32 v22, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 52 -; SI-NEXT: v_writelane_b32 v41, s5, 53 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 54 -; SI-NEXT: v_writelane_b32 v41, s5, 55 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 56 -; SI-NEXT: v_writelane_b32 v41, s5, 57 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 58 -; SI-NEXT: v_writelane_b32 v41, s5, 59 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 60 -; SI-NEXT: v_writelane_b32 v41, s5, 61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, s4, 0 +; SI-NEXT: v_writelane_b32 v23, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 62 -; SI-NEXT: v_writelane_b32 v41, s5, 63 +; SI-NEXT: v_writelane_b32 v23, s4, 2 +; SI-NEXT: v_writelane_b32 v23, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s4, 0 -; SI-NEXT: v_writelane_b32 v43, s5, 1 +; SI-NEXT: v_writelane_b32 v23, s4, 4 +; SI-NEXT: v_writelane_b32 v23, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 2 -; SI-NEXT: v_writelane_b32 v43, s5, 3 +; SI-NEXT: v_writelane_b32 v23, s4, 6 +; SI-NEXT: v_writelane_b32 v23, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: v_writelane_b32 v43, s5, 5 +; SI-NEXT: v_writelane_b32 v23, s4, 8 +; SI-NEXT: v_writelane_b32 v23, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: v_writelane_b32 v43, s5, 7 +; SI-NEXT: v_writelane_b32 v23, s4, 10 +; SI-NEXT: v_writelane_b32 v23, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: v_writelane_b32 v43, s5, 9 +; SI-NEXT: v_writelane_b32 v23, s4, 12 +; SI-NEXT: v_writelane_b32 v23, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_writelane_b32 v43, s5, 11 -; SI-NEXT: v_writelane_b32 v43, s16, 12 -; SI-NEXT: v_writelane_b32 v43, s17, 13 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: v_writelane_b32 v23, s4, 14 +; SI-NEXT: v_writelane_b32 v23, s5, 15 +; SI-NEXT: v_writelane_b32 v23, s48, 16 +; SI-NEXT: v_writelane_b32 v23, s49, 17 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s16, 14 -; SI-NEXT: v_writelane_b32 v43, s17, 15 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: v_writelane_b32 v43, s16, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 17 +; SI-NEXT: v_writelane_b32 v23, s48, 18 +; SI-NEXT: v_writelane_b32 v23, s49, 19 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v64i16_to_v128i8_scalar: @@ -216190,59 +213693,87 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v28 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v26 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 @@ -216259,22 +213790,152 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 @@ -216286,7 +213947,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 @@ -216294,1255 +213954,990 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 ; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v62 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB100_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v63 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v62 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v61 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v58 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v40 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v54 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v52 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v48 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v34 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: @@ -219847,1220 +217242,1057 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v23 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v22 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v21 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v19 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v30, 1.0, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v57 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s29 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB101_2 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_mov_b32_e32 v32, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v40 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_mov_b32_e32 v40, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v45 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_mov_b32_e32 v46, v54 -; SI-NEXT: v_mov_b32_e32 v16, v20 -; SI-NEXT: v_mov_b32_e32 v20, v32 -; SI-NEXT: v_mov_b32_e32 v32, v53 -; SI-NEXT: v_mov_b32_e32 v45, v40 -; SI-NEXT: v_mov_b32_e32 v56, v37 -; SI-NEXT: v_mov_b32_e32 v47, v11 -; SI-NEXT: s_branch .LBB101_3 -; SI-NEXT: .LBB101_2: -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v20, v53 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v40 -; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v37 -; SI-NEXT: v_mov_b32_e32 v47, v11 -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: .LBB101_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v5, v59 -; SI-NEXT: v_mov_b32_e32 v10, v63 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v63, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v7, v60 -; SI-NEXT: v_mov_b32_e32 v11, v57 -; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_mov_b32_e32 v37, v55 -; SI-NEXT: v_mov_b32_e32 v15, v36 -; SI-NEXT: v_mov_b32_e32 v38, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v54, v8 -; SI-NEXT: v_mov_b32_e32 v40, v9 -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: v_mov_b32_e32 v60, v41 -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: s_cbranch_vccnz .LBB101_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: .LBB101_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; SI-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: .LBB101_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v18, v51, v18 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v36 +; SI-NEXT: v_or_b32_e32 v19, v53, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v31 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v38 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v39 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v20, v51, v20 ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: @@ -223840,310 +221072,248 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_mov_b32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v44 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -224153,200 +221323,346 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v14 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v49 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -224370,773 +221686,631 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v56 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v2, v31 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v11 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v27 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16: @@ -225377,1227 +222551,1228 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-LABEL: bitcast_v64f16_to_v64bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v29, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB103_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_mov_b32_e32 v38, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_mov_b32_e32 v11, v32 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v25, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v48, v55 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v20 -; SI-NEXT: s_branch .LBB103_3 -; SI-NEXT: .LBB103_2: -; SI-NEXT: v_mov_b32_e32 v11, v32 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v38, v54 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v48, v55 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v51, v2 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB103_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: v_mov_b32_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v50 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v62 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v17, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v57 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v51 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v43 +; SI-NEXT: v_mov_b32_e32 v42, v60 +; SI-NEXT: s_branch .LBB103_3 +; SI-NEXT: .LBB103_2: +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: .LBB103_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v53, v15 +; SI-NEXT: v_mov_b32_e32 v56, v13 +; SI-NEXT: v_mov_b32_e32 v8, v39 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_mov_b32_e32 v38, v24 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v14 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v23, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v16 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v6, v31 +; SI-NEXT: v_mov_b32_e32 v22, v29 +; SI-NEXT: v_mov_b32_e32 v26, v33 +; SI-NEXT: v_mov_b32_e32 v2, v49 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB103_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v52 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v42 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v41 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v53 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v56 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v47 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v61 +; SI-NEXT: v_mov_b32_e32 v18, v53 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v35, v19 -; SI-NEXT: v_mov_b32_e32 v8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[45:46], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[5:6], 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[56:57], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[58:59], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[60:61], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v0, v45 +; SI-NEXT: v_mov_b32_e32 v1, v46 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: @@ -226913,1187 +224088,1127 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v20 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v11 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v34 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v41 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v36 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v17 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v25 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v62 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v23 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB104_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v61 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v60 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v38 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_alignbit_b32 v0, v15, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_alignbit_b32 v15, v22, v15, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_alignbit_b32 v0, v11, v0, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_alignbit_b32 v0, v13, v0, 16 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_alignbit_b32 v0, v12, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v12, v0, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_alignbit_b32 v0, v13, v0, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v4, v10, v4, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_alignbit_b32 v0, v15, v0, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v0, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v16, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v16, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v18, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 -; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v33, v50, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v34, v51, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v25, v52, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v39, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v23, v53, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: .LBB104_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v31, v48, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v40, v54, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v40, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v23, v3, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v25, v5, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v34, v7, 16 +; SI-NEXT: v_alignbit_b32 v15, v49, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v33, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v15, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v31, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v17, v55, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v19, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v1, v19, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v27, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v26, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v24, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v22, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v21, v13, 16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v20, v14, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v52 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v50 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64i16: @@ -230322,1280 +227437,1073 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB105_2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v19 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v60 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v9 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_mov_b32_e32 v38, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v52, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v7, v40 -; SI-NEXT: v_mov_b32_e32 v48, v27 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 -; SI-NEXT: v_mov_b32_e32 v53, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 -; SI-NEXT: v_mov_b32_e32 v46, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v45 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v41 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v56 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v34 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_mov_b32_e32 v39, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: s_branch .LBB105_3 -; SI-NEXT: .LBB105_2: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v7, v40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v9 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v58 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_mov_b32_e32 v38, v13 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v52, v63 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v13, v37 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v19, v48 -; SI-NEXT: v_mov_b32_e32 v63, v7 -; SI-NEXT: v_mov_b32_e32 v58, v53 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: s_cbranch_vccnz .LBB105_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_mov_b32_e32 v7, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_mov_b32_e32 v17, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_mov_b32_e32 v11, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshr_b64 v[33:34], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[26:27], 16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_mov_b32_e32 v4, v38 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v10 +; SI-NEXT: v_lshr_b64 v[27:28], v[35:36], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_mov_b32_e32 v6, v49 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; SI-NEXT: v_mov_b32_e32 v8, v51 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[32:33], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 -; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_mov_b32_e32 v16, v33 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[30:31], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; SI-NEXT: v_mov_b32_e32 v18, v41 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 -; SI-NEXT: v_mov_b32_e32 v39, v59 -; SI-NEXT: v_mov_b32_e32 v40, v60 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 -; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 -; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 -; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v27, v30 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 -; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v36 +; SI-NEXT: v_lshr_b64 v[35:36], v[48:49], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[44:45], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[3:4], v[52:53], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[54:55], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; SI-NEXT: v_lshr_b64 v[7:8], v[40:41], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[9:10], v[42:43], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[44:45], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[13:14], v[46:47], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[15:16], v[56:57], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[58:59], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 +; SI-NEXT: v_lshr_b64 v[19:20], v[60:61], 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[23:24], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[20:21], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[18:19], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[14:15], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[12:13], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[10:11], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[22:23], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[34:35], 16 +; SI-NEXT: .LBB105_3: ; %end ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; VI: ; %bb.0: @@ -234223,853 +231131,748 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v63 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v62 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: .LBB106_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB106_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v9, v52, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v33, v3 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v18, v11 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v54, v5 -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v7, v53, v7 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v35 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v38 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v51 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v55 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v26, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: .LBB106_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v48, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v11, v50, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v14, v53, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v55, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v18, v41, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v20, v43, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v45, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v23, v46, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v24, v47, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v25, v56, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v26, v57, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v27, v58, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_or_b32_e32 v30, v61, v30 +; SI-NEXT: v_or_b32_e32 v60, v60, v29 +; SI-NEXT: v_or_b32_e32 v59, v59, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v60 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_and_b32_e32 v63, 0xffff, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_or_b32_e32 v62, v62, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v62 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64bf16: @@ -235309,355 +232112,318 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-LABEL: bitcast_v64i16_to_v64bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v40, s30, 0 -; SI-NEXT: v_writelane_b32 v40, s31, 1 -; SI-NEXT: v_writelane_b32 v40, s34, 2 -; SI-NEXT: v_writelane_b32 v40, s35, 3 -; SI-NEXT: v_writelane_b32 v40, s36, 4 -; SI-NEXT: v_writelane_b32 v40, s37, 5 -; SI-NEXT: v_writelane_b32 v40, s38, 6 -; SI-NEXT: v_writelane_b32 v40, s39, 7 -; SI-NEXT: v_writelane_b32 v40, s48, 8 -; SI-NEXT: v_writelane_b32 v40, s49, 9 -; SI-NEXT: v_writelane_b32 v40, s50, 10 -; SI-NEXT: v_writelane_b32 v40, s51, 11 -; SI-NEXT: v_writelane_b32 v40, s52, 12 -; SI-NEXT: v_writelane_b32 v40, s53, 13 -; SI-NEXT: v_writelane_b32 v40, s54, 14 -; SI-NEXT: v_writelane_b32 v40, s55, 15 -; SI-NEXT: v_writelane_b32 v40, s64, 16 -; SI-NEXT: v_writelane_b32 v40, s65, 17 -; SI-NEXT: v_writelane_b32 v40, s66, 18 -; SI-NEXT: v_writelane_b32 v40, s67, 19 -; SI-NEXT: v_writelane_b32 v40, s68, 20 -; SI-NEXT: v_writelane_b32 v40, s69, 21 -; SI-NEXT: v_writelane_b32 v40, s70, 22 -; SI-NEXT: v_writelane_b32 v40, s71, 23 -; SI-NEXT: v_writelane_b32 v40, s80, 24 -; SI-NEXT: v_writelane_b32 v40, s81, 25 -; SI-NEXT: v_writelane_b32 v40, s82, 26 -; SI-NEXT: v_writelane_b32 v40, s83, 27 -; SI-NEXT: v_writelane_b32 v40, s84, 28 -; SI-NEXT: v_writelane_b32 v40, s85, 29 -; SI-NEXT: v_writelane_b32 v40, s86, 30 -; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_mov_b32 s61, s18 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s60, s16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: v_writelane_b32 v41, s60, 1 -; SI-NEXT: v_writelane_b32 v41, s19, 2 -; SI-NEXT: v_writelane_b32 v41, s61, 3 -; SI-NEXT: v_writelane_b32 v41, s72, 4 -; SI-NEXT: s_mov_b32 s74, s23 -; SI-NEXT: v_writelane_b32 v41, s20, 5 -; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s76, s25 -; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s76, 8 -; SI-NEXT: s_mov_b32 s78, s27 -; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: v_writelane_b32 v41, s78, 10 -; SI-NEXT: s_mov_b32 s88, s29 -; SI-NEXT: v_writelane_b32 v41, s26, 11 -; SI-NEXT: v_writelane_b32 v41, s88, 12 -; SI-NEXT: v_readfirstlane_b32 s23, v2 -; SI-NEXT: v_writelane_b32 v41, s28, 13 -; SI-NEXT: v_readfirstlane_b32 s79, v4 -; SI-NEXT: v_writelane_b32 v41, s23, 14 -; SI-NEXT: v_readfirstlane_b32 s90, v3 -; SI-NEXT: v_writelane_b32 v41, s79, 15 -; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_writelane_b32 v41, s90, 16 -; SI-NEXT: v_readfirstlane_b32 s92, v5 -; SI-NEXT: v_writelane_b32 v41, s91, 17 -; SI-NEXT: v_readfirstlane_b32 s93, v8 -; SI-NEXT: v_writelane_b32 v41, s92, 18 -; SI-NEXT: v_readfirstlane_b32 s94, v7 -; SI-NEXT: v_writelane_b32 v41, s93, 19 -; SI-NEXT: v_readfirstlane_b32 s95, v10 -; SI-NEXT: v_writelane_b32 v41, s94, 20 -; SI-NEXT: v_readfirstlane_b32 s30, v9 -; SI-NEXT: v_writelane_b32 v41, s95, 21 -; SI-NEXT: v_readfirstlane_b32 s31, v12 -; SI-NEXT: v_writelane_b32 v41, s30, 22 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s21, v31 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s80, v32 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s75, v33 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s84, v34 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s77, v35 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s87, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s18, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v41, s31, 23 -; SI-NEXT: v_readfirstlane_b32 s34, v11 -; SI-NEXT: v_readfirstlane_b32 s35, v14 -; SI-NEXT: v_readfirstlane_b32 s36, v13 -; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s37, v16 -; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s38, v15 -; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_readfirstlane_b32 s14, v30 -; SI-NEXT: v_readfirstlane_b32 s15, v29 -; SI-NEXT: v_readfirstlane_b32 s12, v28 -; SI-NEXT: v_readfirstlane_b32 s13, v27 -; SI-NEXT: v_readfirstlane_b32 s10, v26 -; SI-NEXT: v_readfirstlane_b32 s11, v25 -; SI-NEXT: v_readfirstlane_b32 s8, v24 -; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s89, v22 -; SI-NEXT: v_readfirstlane_b32 s7, v21 -; SI-NEXT: v_readfirstlane_b32 s25, v20 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s39, v18 -; SI-NEXT: v_readfirstlane_b32 s27, v17 -; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s58, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s59, v32 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s56, v33 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s57, v39 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s46, v48 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s47, v49 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s44, v50 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s45, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s42, v34 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_writelane_b32 v33, s30, 0 +; SI-NEXT: v_writelane_b32 v33, s31, 1 +; SI-NEXT: v_writelane_b32 v33, s34, 2 +; SI-NEXT: v_writelane_b32 v33, s35, 3 +; SI-NEXT: v_writelane_b32 v33, s36, 4 +; SI-NEXT: v_writelane_b32 v33, s37, 5 +; SI-NEXT: v_writelane_b32 v33, s38, 6 +; SI-NEXT: v_writelane_b32 v33, s39, 7 +; SI-NEXT: v_writelane_b32 v33, s48, 8 +; SI-NEXT: v_writelane_b32 v33, s49, 9 +; SI-NEXT: v_writelane_b32 v33, s50, 10 +; SI-NEXT: v_writelane_b32 v33, s51, 11 +; SI-NEXT: v_writelane_b32 v33, s52, 12 +; SI-NEXT: v_writelane_b32 v33, s53, 13 +; SI-NEXT: v_writelane_b32 v33, s54, 14 +; SI-NEXT: v_writelane_b32 v33, s55, 15 +; SI-NEXT: v_writelane_b32 v33, s64, 16 +; SI-NEXT: v_writelane_b32 v33, s65, 17 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v33, s66, 18 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v34, s6, 0 +; SI-NEXT: v_writelane_b32 v33, s67, 19 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: v_writelane_b32 v34, s7, 1 +; SI-NEXT: v_writelane_b32 v33, s68, 20 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: v_writelane_b32 v34, s56, 2 +; SI-NEXT: v_writelane_b32 v33, s69, 21 +; SI-NEXT: s_lshr_b32 s90, s20, 16 +; SI-NEXT: v_writelane_b32 v34, s57, 3 +; SI-NEXT: v_writelane_b32 v33, s70, 22 +; SI-NEXT: s_lshr_b32 s91, s21, 16 +; SI-NEXT: v_writelane_b32 v34, s90, 4 +; SI-NEXT: v_writelane_b32 v33, s71, 23 +; SI-NEXT: s_lshr_b32 s92, s22, 16 +; SI-NEXT: v_writelane_b32 v34, s91, 5 +; SI-NEXT: v_writelane_b32 v33, s80, 24 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: v_writelane_b32 v34, s92, 6 +; SI-NEXT: v_writelane_b32 v33, s81, 25 +; SI-NEXT: s_lshr_b32 s94, s24, 16 +; SI-NEXT: v_writelane_b32 v34, s93, 7 +; SI-NEXT: v_writelane_b32 v33, s82, 26 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: v_writelane_b32 v34, s94, 8 +; SI-NEXT: v_writelane_b32 v33, s83, 27 +; SI-NEXT: s_lshr_b32 s30, s26, 16 +; SI-NEXT: v_writelane_b32 v34, s95, 9 +; SI-NEXT: v_writelane_b32 v33, s84, 28 +; SI-NEXT: s_lshr_b32 s31, s27, 16 +; SI-NEXT: v_writelane_b32 v34, s30, 10 +; SI-NEXT: v_writelane_b32 v33, s85, 29 +; SI-NEXT: s_lshr_b32 s34, s28, 16 +; SI-NEXT: v_writelane_b32 v34, s31, 11 +; SI-NEXT: v_writelane_b32 v33, s86, 30 +; SI-NEXT: s_lshr_b32 s35, s29, 16 +; SI-NEXT: v_writelane_b32 v34, s34, 12 +; SI-NEXT: v_writelane_b32 v33, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s36, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_writelane_b32 v34, s35, 13 +; SI-NEXT: v_writelane_b32 v33, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s37, v5 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: v_writelane_b32 v41, s5, 24 -; SI-NEXT: v_writelane_b32 v41, s34, 25 -; SI-NEXT: v_writelane_b32 v41, s35, 26 -; SI-NEXT: v_writelane_b32 v41, s36, 27 -; SI-NEXT: v_writelane_b32 v41, s37, 28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s43, v35 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s40, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v37 -; SI-NEXT: v_writelane_b32 v41, s38, 29 -; SI-NEXT: v_writelane_b32 v41, s39, 30 +; SI-NEXT: v_writelane_b32 v34, s36, 14 +; SI-NEXT: v_writelane_b32 v33, s97, 33 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s46, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_writelane_b32 v34, s37, 15 +; SI-NEXT: v_writelane_b32 v33, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s89, v19 +; SI-NEXT: v_readfirstlane_b32 s88, v17 +; SI-NEXT: v_readfirstlane_b32 s79, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s77, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v13 +; SI-NEXT: v_readfirstlane_b32 s75, v12 +; SI-NEXT: v_readfirstlane_b32 s74, v11 +; SI-NEXT: v_readfirstlane_b32 s73, v10 +; SI-NEXT: v_readfirstlane_b32 s72, v9 +; SI-NEXT: v_readfirstlane_b32 s63, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v7 +; SI-NEXT: v_readfirstlane_b32 s61, v6 +; SI-NEXT: v_readfirstlane_b32 s60, v5 +; SI-NEXT: v_readfirstlane_b32 s59, v4 +; SI-NEXT: v_readfirstlane_b32 s58, v18 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v2 +; SI-NEXT: v_writelane_b32 v34, s5, 16 +; SI-NEXT: v_writelane_b32 v33, s99, 35 +; SI-NEXT: v_writelane_b32 v34, vcc_lo, 17 ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s4, s60, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 32 +; SI-NEXT: s_lshl_b32 s4, s16, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 19 +; SI-NEXT: s_lshl_b32 s4, s6, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 18 ; SI-NEXT: s_lshl_b32 s4, s17, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s61, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 34 +; SI-NEXT: v_writelane_b32 v34, s4, 21 +; SI-NEXT: s_lshl_b32 s4, s7, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 20 +; SI-NEXT: s_lshl_b32 s4, s18, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 23 +; SI-NEXT: s_lshl_b32 s4, s56, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 22 ; SI-NEXT: s_lshl_b32 s4, s19, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: v_writelane_b32 v34, s4, 25 +; SI-NEXT: s_lshl_b32 s4, s57, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 24 ; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: s_lshl_b32 s4, s72, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 -; SI-NEXT: s_lshl_b32 s4, s74, 16 -; SI-NEXT: s_lshl_b32 s16, s22, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 37 -; SI-NEXT: s_lshl_b32 s6, s24, 16 -; SI-NEXT: s_lshl_b32 s73, s76, 16 -; SI-NEXT: s_lshl_b32 s98, s26, 16 -; SI-NEXT: s_lshl_b32 s63, s78, 16 -; SI-NEXT: s_lshl_b32 s96, s28, 16 -; SI-NEXT: s_lshl_b32 s62, s88, 16 -; SI-NEXT: s_lshl_b32 s97, s5, 16 -; SI-NEXT: s_lshl_b32 s99, s23, 16 -; SI-NEXT: s_lshl_b32 s85, s90, 16 -; SI-NEXT: s_lshl_b32 s86, s79, 16 -; SI-NEXT: s_lshl_b32 s81, s92, 16 -; SI-NEXT: s_lshl_b32 s82, s91, 16 -; SI-NEXT: s_lshl_b32 s70, s94, 16 -; SI-NEXT: s_lshl_b32 s71, s93, 16 -; SI-NEXT: s_lshl_b32 s68, s30, 16 -; SI-NEXT: s_lshl_b32 s69, s95, 16 -; SI-NEXT: s_lshl_b32 s66, s34, 16 -; SI-NEXT: s_lshl_b32 s67, s31, 16 -; SI-NEXT: s_lshl_b32 s64, s36, 16 -; SI-NEXT: s_lshl_b32 s65, s35, 16 -; SI-NEXT: s_lshl_b32 s54, s38, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 27 +; SI-NEXT: s_lshl_b32 s4, s90, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 26 +; SI-NEXT: s_lshl_b32 s4, s21, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 29 +; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 28 +; SI-NEXT: s_lshl_b32 s4, s22, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s92, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 30 +; SI-NEXT: s_lshl_b32 s4, s23, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s93, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 32 +; SI-NEXT: s_lshl_b32 s4, s24, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s94, 16 +; SI-NEXT: v_writelane_b32 v34, s4, 34 +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_lshl_b32 s6, s95, 16 +; SI-NEXT: s_lshl_b32 s99, s26, 16 +; SI-NEXT: s_lshl_b32 s57, s30, 16 +; SI-NEXT: s_lshl_b32 s87, s27, 16 +; SI-NEXT: s_lshl_b32 s56, s31, 16 +; SI-NEXT: s_lshl_b32 s85, s28, 16 +; SI-NEXT: s_lshl_b32 s98, s34, 16 +; SI-NEXT: s_lshl_b32 s83, s29, 16 +; SI-NEXT: s_lshl_b32 s97, s35, 16 +; SI-NEXT: s_lshl_b32 s81, s5, 16 +; SI-NEXT: s_lshl_b32 s96, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s71, s41, 16 +; SI-NEXT: s_lshl_b32 s86, s15, 16 +; SI-NEXT: s_lshl_b32 s69, s43, 16 +; SI-NEXT: s_lshl_b32 s84, s58, 16 +; SI-NEXT: s_lshl_b32 s70, s45, 16 +; SI-NEXT: s_lshl_b32 s82, s59, 16 +; SI-NEXT: s_lshl_b32 s65, s36, 16 +; SI-NEXT: s_lshl_b32 s80, s60, 16 ; SI-NEXT: s_lshl_b32 s55, s37, 16 -; SI-NEXT: s_lshl_b32 s52, s27, 16 -; SI-NEXT: s_lshl_b32 s53, s39, 16 -; SI-NEXT: s_lshl_b32 s50, s29, 16 -; SI-NEXT: s_lshl_b32 s51, s25, 16 -; SI-NEXT: s_lshl_b32 s48, s7, 16 -; SI-NEXT: s_lshl_b32 s49, s89, 16 -; SI-NEXT: s_lshl_b32 s38, s9, 16 -; SI-NEXT: s_lshl_b32 s39, s8, 16 -; SI-NEXT: s_lshl_b32 s37, s11, 16 -; SI-NEXT: s_lshl_b32 s35, s10, 16 -; SI-NEXT: s_lshl_b32 s31, s13, 16 -; SI-NEXT: s_lshl_b32 s36, s12, 16 -; SI-NEXT: s_lshl_b32 s95, s15, 16 -; SI-NEXT: s_lshl_b32 s34, s14, 16 -; SI-NEXT: s_lshl_b32 s93, s41, 16 -; SI-NEXT: s_lshl_b32 s30, s40, 16 -; SI-NEXT: s_lshl_b32 s91, s43, 16 -; SI-NEXT: s_lshl_b32 s94, s42, 16 -; SI-NEXT: s_lshl_b32 s92, s45, 16 -; SI-NEXT: s_lshl_b32 s90, s44, 16 -; SI-NEXT: s_lshl_b32 s88, s47, 16 -; SI-NEXT: s_lshl_b32 s28, s46, 16 -; SI-NEXT: s_lshl_b32 s78, s57, 16 -; SI-NEXT: s_lshl_b32 s26, s56, 16 -; SI-NEXT: s_lshl_b32 s76, s59, 16 -; SI-NEXT: s_lshl_b32 s24, s58, 16 -; SI-NEXT: s_lshl_b32 s74, s87, 16 -; SI-NEXT: s_mov_b32 s23, s18 -; SI-NEXT: s_lshl_b32 s22, s18, 16 -; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_mov_b32 s79, s77 -; SI-NEXT: s_lshl_b32 s20, s77, 16 -; SI-NEXT: s_lshl_b32 s61, s84, 16 -; SI-NEXT: s_mov_b32 s18, s75 -; SI-NEXT: s_lshl_b32 s19, s75, 16 -; SI-NEXT: s_lshl_b32 s60, s80, 16 -; SI-NEXT: s_mov_b32 s77, s21 -; SI-NEXT: s_lshl_b32 s17, s21, 16 +; SI-NEXT: s_lshl_b32 s67, s61, 16 +; SI-NEXT: s_lshl_b32 s53, s8, 16 +; SI-NEXT: s_lshl_b32 s68, s62, 16 +; SI-NEXT: s_lshl_b32 s51, s9, 16 +; SI-NEXT: s_lshl_b32 s66, s63, 16 +; SI-NEXT: s_lshl_b32 s49, s10, 16 +; SI-NEXT: s_lshl_b32 s64, s72, 16 +; SI-NEXT: s_lshl_b32 s39, s11, 16 +; SI-NEXT: s_lshl_b32 s54, s73, 16 +; SI-NEXT: s_lshl_b32 s37, s12, 16 +; SI-NEXT: s_lshl_b32 s52, s74, 16 +; SI-NEXT: s_lshl_b32 s35, s13, 16 +; SI-NEXT: s_lshl_b32 s50, s75, 16 +; SI-NEXT: s_lshl_b32 s31, s14, 16 +; SI-NEXT: s_lshl_b32 s48, s76, 16 +; SI-NEXT: s_lshl_b32 s95, s40, 16 +; SI-NEXT: s_lshl_b32 s38, s77, 16 +; SI-NEXT: s_lshl_b32 s34, s42, 16 +; SI-NEXT: s_lshl_b32 s36, s78, 16 +; SI-NEXT: s_lshl_b32 s92, s44, 16 +; SI-NEXT: s_lshl_b32 s30, s79, 16 +; SI-NEXT: s_lshl_b32 s93, s46, 16 +; SI-NEXT: s_lshl_b32 s94, s88, 16 +; SI-NEXT: s_lshl_b32 s90, s47, 16 +; SI-NEXT: s_lshl_b32 s91, s89, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s79, s77 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s23, s18 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s77, s21 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s18, s75 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_mov_b32 s4, s60 -; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s61 -; SI-NEXT: s_mov_b32 s60, s72 -; SI-NEXT: s_mov_b32 s61, s74 -; SI-NEXT: s_mov_b32 s72, s76 -; SI-NEXT: s_mov_b32 s74, s78 -; SI-NEXT: s_mov_b32 s76, s88 -; SI-NEXT: s_mov_b32 s78, s92 -; SI-NEXT: s_mov_b32 s88, s91 -; SI-NEXT: s_mov_b32 s91, s93 -; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s4, s90 +; SI-NEXT: s_mov_b32 s5, s93 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s92, s34 ; SI-NEXT: s_mov_b32 s93, s95 -; SI-NEXT: s_mov_b32 s94, s30 ; SI-NEXT: s_mov_b32 s95, s31 -; SI-NEXT: s_mov_b32 s30, s34 -; SI-NEXT: s_mov_b32 s31, s37 +; SI-NEXT: s_mov_b32 s31, s35 ; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s35, s37 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s39 ; SI-NEXT: s_mov_b32 s38, s48 @@ -235668,543 +232434,465 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_mov_b32 s51, s53 ; SI-NEXT: s_mov_b32 s52, s54 ; SI-NEXT: s_mov_b32 s53, s55 -; SI-NEXT: s_mov_b32 s54, s6 -; SI-NEXT: s_mov_b32 s55, s16 +; SI-NEXT: s_mov_b32 s54, s64 +; SI-NEXT: s_mov_b32 s55, s65 +; SI-NEXT: s_mov_b32 s64, s66 +; SI-NEXT: s_mov_b32 s65, s70 +; SI-NEXT: s_mov_b32 s66, s68 +; SI-NEXT: s_mov_b32 s68, s80 +; SI-NEXT: s_mov_b32 s70, s82 +; SI-NEXT: s_mov_b32 s80, s7 +; SI-NEXT: s_mov_b32 s82, s6 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s46, 0xffff +; SI-NEXT: s_lshl_b32 s46, s88, 16 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_or_b32 s5, s46, s5 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s46, s79, 16 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_or_b32 s44, s46, s44 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s42, s46, s42 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s46, s77, 16 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s40, s46, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s76, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s14, s46, s14 ; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s46, s75, 16 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s13, s46, s13 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s74, 16 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 30 -; SI-NEXT: s_and_b32 s4, s80, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 -; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s12, s46, s12 ; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s13, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 29 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s84, 0xffff -; SI-NEXT: s_lshl_b32 s60, s18, 16 -; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_lshl_b32 s11, s25, 16 -; SI-NEXT: s_add_i32 s25, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 28 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_and_b32 s60, s83, 0xffff -; SI-NEXT: s_lshl_b32 s61, s79, 16 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_lshl_b32 s15, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 27 -; SI-NEXT: s_or_b32 vcc_lo, s61, s60 -; SI-NEXT: s_lshl_b32 s61, s23, 16 -; SI-NEXT: s_add_i32 s23, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_lshl_b32 s46, s73, 16 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s11, s46, s11 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s72, 16 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_or_b32 s10, s46, s10 ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s21, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshl_b32 s46, s63, 16 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_readlane_b32 s6, v34, 15 +; SI-NEXT: s_or_b32 s9, s46, s9 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_add_i32 s7, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v34, 14 +; SI-NEXT: s_or_b32 s8, s46, s8 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_lshl_b32 s9, s89, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_lshl_b32 s19, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_lshl_b32 s46, s61, 16 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 19 -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_or_b32 s7, s46, s7 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_or_b32 s15, s20, s15 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_or_b32 s6, s17, s6 -; SI-NEXT: v_readlane_b32 s17, v41, 18 -; SI-NEXT: v_readlane_b32 s18, v41, 17 -; SI-NEXT: s_or_b32 s19, s19, s20 -; SI-NEXT: s_add_i32 s98, s17, 3 -; SI-NEXT: s_lshl_b32 s20, s18, 16 -; SI-NEXT: v_readlane_b32 s18, v41, 16 -; SI-NEXT: s_and_b32 s17, s98, 0xffff -; SI-NEXT: s_add_i32 s96, s18, 3 -; SI-NEXT: v_readlane_b32 s18, v41, 15 -; SI-NEXT: s_or_b32 s17, s20, s17 -; SI-NEXT: s_and_b32 s20, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s18, 16 -; SI-NEXT: v_readlane_b32 s18, v41, 24 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_or_b32 s18, s21, s18 -; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_readlane_b32 s22, v41, 12 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_or_b32 s21, s22, s21 -; SI-NEXT: v_readlane_b32 s22, v41, 11 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_readlane_b32 s23, v41, 10 -; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_readlane_b32 s23, v41, 9 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_readlane_b32 s24, v41, 8 -; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_or_b32 s23, s24, s23 -; SI-NEXT: v_readlane_b32 s24, v41, 7 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v41, 6 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v41, 5 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_readlane_b32 s26, v41, 4 -; SI-NEXT: s_and_b32 s25, s25, 0xffff -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_or_b32 s25, s26, s25 -; SI-NEXT: v_readlane_b32 s26, v41, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_readlane_b32 s27, v41, 2 -; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_readlane_b32 s27, v41, 1 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: v_readlane_b32 s28, v41, 0 -; SI-NEXT: s_and_b32 s27, s27, 0xffff -; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s28, 31 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s87, s87, 3 -; SI-NEXT: s_add_i32 s59, s59, 3 -; SI-NEXT: s_add_i32 s57, s57, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_lshl_b32 s46, s60, 16 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: v_writelane_b32 v41, s27, 32 -; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 -; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_and_b32 s59, s59, 0xffff -; SI-NEXT: s_lshl_b32 s58, s58, 16 -; SI-NEXT: s_and_b32 s57, s57, 0xffff -; SI-NEXT: s_lshl_b32 s56, s56, 16 -; SI-NEXT: s_and_b32 s47, s47, 0xffff -; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s6, s46, s6 ; SI-NEXT: s_and_b32 s45, s45, 0xffff -; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_lshl_b32 s46, s59, 16 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_or_b32 s45, s46, s45 ; SI-NEXT: s_and_b32 s43, s43, 0xffff -; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s43, s46, s43 ; SI-NEXT: s_and_b32 s41, s41, 0xffff -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 33 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_or_b32 vcc_hi, s61, s60 -; SI-NEXT: s_or_b32 s58, s58, s59 -; SI-NEXT: s_or_b32 s56, s56, s57 -; SI-NEXT: s_or_b32 s46, s46, s47 -; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: v_writelane_b32 v41, s26, 34 -; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s15, 16 +; SI-NEXT: v_readlane_b32 s15, v34, 16 +; SI-NEXT: s_or_b32 s41, s46, s41 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 17 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s15, s46, s15 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 13 +; SI-NEXT: s_and_b32 s29, s29, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 12 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s28, s46, s28 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 11 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s27, s46, s27 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 10 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s26, s46, s26 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 9 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s25, s46, s25 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s24, s46, s24 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 7 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s23, s46, s23 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 6 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s22, s46, s22 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 5 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s21, s46, s21 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 4 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s20, s46, s20 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 3 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s19, s46, s19 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 2 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s18, s46, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 1 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s17, s46, s17 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s46, v34, 0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s16, s46, s16 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s46, 18 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 19 +; SI-NEXT: s_and_b32 s16, s17, 0xffff0000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 20 +; SI-NEXT: s_lshl_b32 s16, s17, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 21 +; SI-NEXT: s_and_b32 s16, s18, 0xffff0000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 22 +; SI-NEXT: s_lshl_b32 s16, s18, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 23 +; SI-NEXT: s_and_b32 s16, s19, 0xffff0000 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 24 +; SI-NEXT: s_lshl_b32 s16, s19, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 25 +; SI-NEXT: s_and_b32 s16, s20, 0xffff0000 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 26 +; SI-NEXT: s_lshl_b32 s16, s20, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 27 +; SI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 28 +; SI-NEXT: s_lshl_b32 s16, s21, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 29 +; SI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 30 +; SI-NEXT: s_lshl_b32 s16, s22, 16 +; SI-NEXT: v_writelane_b32 v34, s16, 31 +; SI-NEXT: s_and_b32 s16, s23, 0xffff0000 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 32 +; SI-NEXT: s_lshl_b32 s16, s23, 16 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 -; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 -; SI-NEXT: s_add_i32 s58, s58, 0x30000 -; SI-NEXT: s_add_i32 s56, s56, 0x30000 -; SI-NEXT: s_add_i32 s46, s46, 0x30000 ; SI-NEXT: s_add_i32 s44, s44, 0x30000 ; SI-NEXT: s_add_i32 s42, s42, 0x30000 ; SI-NEXT: s_add_i32 s40, s40, 0x30000 ; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s11, s11, 0x30000 -; SI-NEXT: s_add_i32 s13, s13, 0x30000 -; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 35 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_writelane_b32 v41, s25, 36 -; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s55, s24, 16 -; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s23, 16 -; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s98, s22, 16 -; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s96, s21, 16 -; SI-NEXT: s_and_b32 s99, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s18, 16 -; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s20, 16 -; SI-NEXT: s_and_b32 s82, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s17, 16 -; SI-NEXT: s_and_b32 s71, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s6, 16 -; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s16, 16 -; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s19, 16 -; SI-NEXT: s_and_b32 s65, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s15, 16 -; SI-NEXT: s_and_b32 s53, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s13, 16 -; SI-NEXT: s_and_b32 s51, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s11, 16 -; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s9, 16 -; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s7, 16 -; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s8, 16 -; SI-NEXT: s_and_b32 s35, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s31, s10, 16 -; SI-NEXT: s_and_b32 s34, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s12, 16 -; SI-NEXT: s_and_b32 s30, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s93, s14, 16 -; SI-NEXT: s_and_b32 s94, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s91, s40, 16 -; SI-NEXT: s_and_b32 s92, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s42, 16 -; SI-NEXT: s_and_b32 s90, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s44, 16 -; SI-NEXT: s_and_b32 s28, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s46, 16 -; SI-NEXT: s_and_b32 s26, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s56, 16 -; SI-NEXT: s_and_b32 s24, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s58, 16 -; SI-NEXT: s_and_b32 s22, vcc_hi, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, vcc_hi, 16 -; SI-NEXT: s_and_b32 s20, vcc_lo, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, vcc_lo, 16 -; SI-NEXT: s_and_b32 s19, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s5, 16 -; SI-NEXT: s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_add_i32 s45, s45, 0x30000 +; SI-NEXT: s_add_i32 s43, s43, 0x30000 +; SI-NEXT: s_add_i32 s41, s41, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s29, s29, 0x30000 +; SI-NEXT: s_add_i32 s28, s28, 0x30000 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v34, s16, 33 +; SI-NEXT: s_and_b32 s16, s24, 0xffff0000 +; SI-NEXT: v_writelane_b32 v34, s16, 34 +; SI-NEXT: s_lshl_b32 s16, s24, 16 +; SI-NEXT: s_and_b32 s82, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s25, 16 +; SI-NEXT: s_and_b32 s57, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s26, 16 +; SI-NEXT: s_and_b32 s56, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s27, 16 +; SI-NEXT: s_and_b32 s98, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s28, 16 +; SI-NEXT: s_and_b32 s97, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s29, 16 +; SI-NEXT: s_and_b32 s96, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s15, 16 +; SI-NEXT: s_and_b32 s86, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s41, 16 +; SI-NEXT: s_and_b32 s84, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s43, 16 +; SI-NEXT: s_and_b32 s70, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s45, 16 +; SI-NEXT: s_and_b32 s68, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s6, 16 +; SI-NEXT: s_and_b32 s67, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s7, 16 +; SI-NEXT: s_and_b32 s66, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s8, 16 +; SI-NEXT: s_and_b32 s64, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s9, 16 +; SI-NEXT: s_and_b32 s54, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s10, 16 +; SI-NEXT: s_and_b32 s52, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s11, 16 +; SI-NEXT: s_and_b32 s50, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s12, 16 +; SI-NEXT: s_and_b32 s48, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s13, 16 +; SI-NEXT: s_and_b32 s38, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s14, 16 +; SI-NEXT: s_and_b32 s36, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s40, 16 +; SI-NEXT: s_and_b32 s34, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s42, 16 +; SI-NEXT: s_and_b32 s30, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s44, 16 +; SI-NEXT: s_and_b32 s94, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s91, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s25, 37 +; SI-NEXT: v_writelane_b32 v34, s16, 35 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_readlane_b32 s6, v41, 31 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_readlane_b32 s6, v41, 32 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s6, v41, 33 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_readlane_b32 s6, v41, 34 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: v_readlane_b32 s6, v41, 35 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_readlane_b32 s6, v34, 18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 20 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_readlane_b32 s6, v41, 36 +; SI-NEXT: v_readlane_b32 s6, v34, 21 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: v_readlane_b32 s6, v41, 37 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v40, 35 -; SI-NEXT: v_readlane_b32 s98, v40, 34 -; SI-NEXT: v_readlane_b32 s97, v40, 33 -; SI-NEXT: v_readlane_b32 s96, v40, 32 -; SI-NEXT: v_readlane_b32 s87, v40, 31 -; SI-NEXT: v_readlane_b32 s86, v40, 30 -; SI-NEXT: v_readlane_b32 s85, v40, 29 -; SI-NEXT: v_readlane_b32 s84, v40, 28 -; SI-NEXT: v_readlane_b32 s83, v40, 27 -; SI-NEXT: v_readlane_b32 s82, v40, 26 -; SI-NEXT: v_readlane_b32 s81, v40, 25 -; SI-NEXT: v_readlane_b32 s80, v40, 24 -; SI-NEXT: v_readlane_b32 s71, v40, 23 -; SI-NEXT: v_readlane_b32 s70, v40, 22 -; SI-NEXT: v_readlane_b32 s69, v40, 21 -; SI-NEXT: v_readlane_b32 s68, v40, 20 -; SI-NEXT: v_readlane_b32 s67, v40, 19 -; SI-NEXT: v_readlane_b32 s66, v40, 18 -; SI-NEXT: v_readlane_b32 s65, v40, 17 -; SI-NEXT: v_readlane_b32 s64, v40, 16 -; SI-NEXT: v_readlane_b32 s55, v40, 15 -; SI-NEXT: v_readlane_b32 s54, v40, 14 -; SI-NEXT: v_readlane_b32 s53, v40, 13 -; SI-NEXT: v_readlane_b32 s52, v40, 12 -; SI-NEXT: v_readlane_b32 s51, v40, 11 -; SI-NEXT: v_readlane_b32 s50, v40, 10 -; SI-NEXT: v_readlane_b32 s49, v40, 9 -; SI-NEXT: v_readlane_b32 s48, v40, 8 -; SI-NEXT: v_readlane_b32 s39, v40, 7 -; SI-NEXT: v_readlane_b32 s38, v40, 6 -; SI-NEXT: v_readlane_b32 s37, v40, 5 -; SI-NEXT: v_readlane_b32 s36, v40, 4 -; SI-NEXT: v_readlane_b32 s35, v40, 3 -; SI-NEXT: v_readlane_b32 s34, v40, 2 -; SI-NEXT: v_readlane_b32 s31, v40, 1 -; SI-NEXT: v_readlane_b32 s30, v40, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s6, v34, 22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 23 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 24 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 25 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 26 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 27 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 28 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 29 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 30 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 32 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 33 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_readlane_b32 s6, v34, 34 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v34, 35 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s80 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s98 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s97 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s83 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s96 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s71 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s84 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s65 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s53 +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s51 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s49 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s39 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s52 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s37 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s50 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s35 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s48 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s31 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s38 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s36 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s93 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s5 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_readlane_b32 s99, v33, 35 +; SI-NEXT: v_readlane_b32 s98, v33, 34 +; SI-NEXT: v_readlane_b32 s97, v33, 33 +; SI-NEXT: v_readlane_b32 s96, v33, 32 +; SI-NEXT: v_readlane_b32 s87, v33, 31 +; SI-NEXT: v_readlane_b32 s86, v33, 30 +; SI-NEXT: v_readlane_b32 s85, v33, 29 +; SI-NEXT: v_readlane_b32 s84, v33, 28 +; SI-NEXT: v_readlane_b32 s83, v33, 27 +; SI-NEXT: v_readlane_b32 s82, v33, 26 +; SI-NEXT: v_readlane_b32 s81, v33, 25 +; SI-NEXT: v_readlane_b32 s80, v33, 24 +; SI-NEXT: v_readlane_b32 s71, v33, 23 +; SI-NEXT: v_readlane_b32 s70, v33, 22 +; SI-NEXT: v_readlane_b32 s69, v33, 21 +; SI-NEXT: v_readlane_b32 s68, v33, 20 +; SI-NEXT: v_readlane_b32 s67, v33, 19 +; SI-NEXT: v_readlane_b32 s66, v33, 18 +; SI-NEXT: v_readlane_b32 s65, v33, 17 +; SI-NEXT: v_readlane_b32 s64, v33, 16 +; SI-NEXT: v_readlane_b32 s55, v33, 15 +; SI-NEXT: v_readlane_b32 s54, v33, 14 +; SI-NEXT: v_readlane_b32 s53, v33, 13 +; SI-NEXT: v_readlane_b32 s52, v33, 12 +; SI-NEXT: v_readlane_b32 s51, v33, 11 +; SI-NEXT: v_readlane_b32 s50, v33, 10 +; SI-NEXT: v_readlane_b32 s49, v33, 9 +; SI-NEXT: v_readlane_b32 s48, v33, 8 +; SI-NEXT: v_readlane_b32 s39, v33, 7 +; SI-NEXT: v_readlane_b32 s38, v33, 6 +; SI-NEXT: v_readlane_b32 s37, v33, 5 +; SI-NEXT: v_readlane_b32 s36, v33, 4 +; SI-NEXT: v_readlane_b32 s35, v33, 3 +; SI-NEXT: v_readlane_b32 s34, v33, 2 +; SI-NEXT: v_readlane_b32 s31, v33, 1 +; SI-NEXT: v_readlane_b32 s30, v33, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64bf16_scalar: @@ -236650,706 +233338,605 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v63 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v61, v55 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v37 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v59, v29 -; SI-NEXT: v_mov_b32_e32 v29, v27 -; SI-NEXT: v_mov_b32_e32 v57, v23 -; SI-NEXT: v_mov_b32_e32 v60, v3 -; SI-NEXT: v_mov_b32_e32 v62, v4 -; SI-NEXT: v_mov_b32_e32 v63, v49 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36 -; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v61, v3, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_or_b32_e32 v24, v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v52, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v55, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v43, v37, v39 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_or_b32_e32 v9, v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_or_b32_e32 v7, v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_or_b32_e32 v3, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v38, v38, v47 -; SI-NEXT: v_or_b32_e32 v54, v54, v42 -; SI-NEXT: v_or_b32_e32 v45, v45, v50 -; SI-NEXT: v_or_b32_e32 v41, v41, v30 -; SI-NEXT: v_or_b32_e32 v46, v46, v32 -; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 -; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 -; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v50, v37, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v61 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v62, v56, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 -; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_or_b32_e32 v60, v56, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v57, v56, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 -; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v29, v29, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v59, v56, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v63, v56, v35 -; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v3, v49, v51 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v41, v41, v61 +; SI-NEXT: v_or_b32_e32 v55, v55, v60 +; SI-NEXT: v_or_b32_e32 v53, v53, v59 +; SI-NEXT: v_or_b32_e32 v52, v52, v58 +; SI-NEXT: v_or_b32_e32 v51, v51, v57 +; SI-NEXT: v_or_b32_e32 v49, v49, v56 +; SI-NEXT: v_or_b32_e32 v48, v48, v47 +; SI-NEXT: v_or_b32_e32 v39, v39, v46 +; SI-NEXT: v_or_b32_e32 v38, v38, v45 +; SI-NEXT: v_or_b32_e32 v36, v36, v43 +; SI-NEXT: v_or_b32_e32 v34, v34, v42 +; SI-NEXT: v_or_b32_e32 v35, v35, v54 +; SI-NEXT: v_or_b32_e32 v33, v33, v40 +; SI-NEXT: v_alignbit_b32 v63, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v60, v7, v60, 16 +; SI-NEXT: v_alignbit_b32 v59, v9, v59, 16 +; SI-NEXT: v_alignbit_b32 v58, v11, v58, 16 +; SI-NEXT: v_alignbit_b32 v57, v13, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v15, v56, 16 +; SI-NEXT: v_alignbit_b32 v47, v17, v47, 16 +; SI-NEXT: v_alignbit_b32 v46, v19, v46, 16 +; SI-NEXT: v_alignbit_b32 v45, v21, v45, 16 +; SI-NEXT: v_alignbit_b32 v43, v25, v43, 16 +; SI-NEXT: v_alignbit_b32 v42, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v54, v29, v54, 16 +; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_or_b32_e32 v62, v62, v37 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_or_b32_e32 v62, v62, v44 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16 -; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v62, v3, v61, 16 +; SI-NEXT: v_alignbit_b32 v61, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v44, v23, v44, 16 ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v34, v37, v34 -; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 -; SI-NEXT: v_or_b32_e32 v34, v34, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v62 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v61 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v43 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v24, v24, v36 +; SI-NEXT: v_or_b32_e32 v28, v28, v34 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v57 +; SI-NEXT: v_or_b32_e32 v12, v12, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16: @@ -237590,155 +234177,224 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, s28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s20 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v39 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB109_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_mov_b64 s[4:5], 0 @@ -237747,620 +234403,537 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: .LBB109_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v40, v61 +; SI-NEXT: v_mov_b32_e32 v34, v54 ; SI-NEXT: s_cbranch_vccnz .LBB109_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_mov_b32_e32 v47, v38 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v39, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v36, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_mov_b32_e32 v45, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_mov_b32_e32 v57, v33 -; SI-NEXT: v_or_b32_e32 v34, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_or_b32_e32 v32, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v52 +; SI-NEXT: v_mov_b32_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshr_b64 v[58:59], v[33:34], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v30, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v54, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_mov_b32_e32 v58, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 -; SI-NEXT: v_or_b32_e32 v43, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v41, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_lshr_b64 v[62:63], v[38:39], 16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_mov_b32_e32 v63, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v45 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_or_b32_e32 v44, v28, v33 -; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v49, v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v52, v20, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 -; SI-NEXT: v_or_b32_e32 v61, v24, v29 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v37, v20, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_or_b32_e32 v12, v28, v25 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: v_or_b32_e32 v12, v20, v21 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_or_b32_e32 v12, v24, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v28, v15 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v42 +; SI-NEXT: v_or_b32_e32 v31, v31, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v36, v39, v2 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: v_or_b32_e32 v12, v20, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v24, v42 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v45, v48, v4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v40, v39, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[56:57], v[31:32], 16 -; SI-NEXT: v_or_b32_e32 v54, v20, v40 -; SI-NEXT: v_or_b32_e32 v20, v24, v5 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v31, v55 -; SI-NEXT: v_lshr_b64 v[54:55], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_or_b32_e32 v8, v28, v3 -; SI-NEXT: v_lshr_b64 v[28:29], v[5:6], 16 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v50, v1 -; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v42, v61 -; SI-NEXT: v_mov_b32_e32 v61, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: .LBB109_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v38, v38, v0 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshr_b64 v[62:63], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshr_b64 v[56:57], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v57, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mov_b32_e32 v63, v35 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_lshr_b64 v[60:61], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v39, v48, v10 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v38, v38, v6 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v39, v39, v14 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v48, v16 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v49 +; SI-NEXT: v_or_b32_e32 v38, v38, v12 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_or_b32_e32 v34, v39, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v55, v48, v22 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v52, v38, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 +; SI-NEXT: v_lshr_b64 v[58:59], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v28 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v49, v36 +; SI-NEXT: v_or_b32_e32 v36, v38, v24 +; SI-NEXT: v_or_b32_e32 v38, v39, v26 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v48, v28 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v44, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v49, v30 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[35:36], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: v_lshr_b64 v[50:51], v[20:21], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v36, v52 +; SI-NEXT: v_lshr_b64 v[51:52], v[26:27], 16 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[10:11], 16 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[48:49], v[22:23], 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[38:39], v[24:25], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[51:52], v[30:31], 16 +; SI-NEXT: .LBB109_5: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v63 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: @@ -238676,1091 +235249,965 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 ; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 ; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 ; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 ; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 ; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 ; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: .LBB110_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64f16: @@ -240000,875 +236447,747 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-LABEL: bitcast_v64i16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; SI-NEXT: v_mov_b32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB111_2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: v_mov_b32_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v48, v23 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v55, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 +; SI-NEXT: v_mov_b32_e32 v50, v25 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v32 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v41, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 +; SI-NEXT: v_mov_b32_e32 v43, v59 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 +; SI-NEXT: v_mov_b32_e32 v44, v63 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 -; SI-NEXT: v_mov_b32_e32 v3, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 -; SI-NEXT: v_mov_b32_e32 v24, v43 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v23 -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: s_branch .LBB111_3 -; SI-NEXT: .LBB111_2: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v3, v10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB111_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v46, v56 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: v_mov_b32_e32 v13, v15 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_mov_b32_e32 v17, v19 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: s_cbranch_vccnz .LBB111_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: s_cbranch_execnz .LBB111_3 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v48 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s9 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v39 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB111_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v19, v57 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v35, v19 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v61 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v20, v34, v20 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v45 +; SI-NEXT: v_or_b32_e32 v23, v35, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v58 +; SI-NEXT: v_or_b32_e32 v25, v35, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v62 +; SI-NEXT: v_or_b32_e32 v27, v35, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v63 +; SI-NEXT: v_or_b32_e32 v29, v35, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v46 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v59 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_or_b32_e32 v31, v34, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v44, v63 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v43, v59 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v42, v38 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v41, v36 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v55, v31 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v50, v25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v48, v23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v39, v22 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_branch .LBB111_2 ; ; VI-LABEL: bitcast_v64i16_to_v64f16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index a498525c92360..7351cff50f25f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1106,35 +1106,43 @@ define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v4i32_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8i16: @@ -1224,14 +1232,22 @@ define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 in ; SI-NEXT: s_lshr_b32 s10, s19, 16 ; SI-NEXT: s_lshr_b32 s11, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -1324,13 +1340,19 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1342,37 +1364,37 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 @@ -1459,38 +1481,42 @@ define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v8i16_to_v4i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s13, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -1607,65 +1633,73 @@ define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v4i32_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v1 -; SI-NEXT: v_mov_b32_e32 v11, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8f16: @@ -1741,17 +1775,17 @@ define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -1762,25 +1796,41 @@ define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 i ; SI-NEXT: s_lshr_b32 s5, s17, 16 ; SI-NEXT: s_lshr_b32 s6, s18, 16 ; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v4i32_to_v8f16_scalar: @@ -1867,15 +1917,27 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1887,29 +1949,29 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1917,27 +1979,27 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2021,15 +2083,27 @@ define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v4i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 @@ -2184,57 +2258,64 @@ define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v4i32_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v3 -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8bf16: @@ -2309,47 +2390,55 @@ define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s19, 16 -; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s18, 16 -; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s17, 16 -; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s9, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_and_b32 s11, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s17, 16 +; SI-NEXT: s_and_b32 s13, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s19, 16 -; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s18, 16 -; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s17, 16 -; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s9, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_and_b32 s11, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s17, 16 +; SI-NEXT: s_and_b32 s13, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s16, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v4i32_to_v8bf16_scalar: @@ -2436,15 +2525,23 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2456,39 +2553,39 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2852,15 +2949,23 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 @@ -5640,35 +5745,43 @@ define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v4f32_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8i16: @@ -5747,14 +5860,14 @@ define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v11, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[5:6], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -5763,19 +5876,27 @@ define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: .LBB37_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v10 -; SI-NEXT: v_mov_b32_e32 v2, v11 -; SI-NEXT: v_mov_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8i16_scalar: @@ -5864,13 +5985,19 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5882,37 +6009,37 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB38_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: .LBB38_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 @@ -5999,38 +6126,42 @@ define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v8i16_to_v4f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s13, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -6147,65 +6278,73 @@ define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v4f32_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v1 -; SI-NEXT: v_mov_b32_e32 v11, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_4 -; SI-NEXT: .LBB40_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB40_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 -; SI-NEXT: .LBB40_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16: @@ -6279,46 +6418,62 @@ define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v4f32_to_v8f16_scalar: @@ -6407,15 +6562,27 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6427,29 +6594,29 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6457,27 +6624,27 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6561,15 +6728,27 @@ define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 ; SI-LABEL: bitcast_v8f16_to_v4f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 @@ -6724,57 +6903,64 @@ define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v4f32_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v3 -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8bf16: @@ -6861,15 +7047,15 @@ define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB45_5 ; SI-NEXT: .LBB45_3: ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 @@ -6884,11 +7070,28 @@ define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8bf16_scalar: @@ -6977,15 +7180,23 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6997,39 +7208,39 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -7393,15 +7604,23 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 @@ -9835,35 +10054,43 @@ define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v2i64_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8i16: @@ -9954,14 +10181,22 @@ define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -10054,13 +10289,19 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10072,37 +10313,37 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB58_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: .LBB58_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 @@ -10189,38 +10430,42 @@ define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v8i16_to_v2i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s13, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -10337,65 +10582,73 @@ define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v2i64_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8f16: @@ -10472,17 +10725,17 @@ define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 i ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -10493,25 +10746,41 @@ define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 i ; SI-NEXT: s_addc_u32 s9, s19, 0 ; SI-NEXT: s_lshr_b32 s10, s8, 16 ; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i64_to_v8f16_scalar: @@ -10598,15 +10867,27 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10618,29 +10899,29 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10648,27 +10929,27 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10752,15 +11033,27 @@ define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v2i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 @@ -10915,57 +11208,64 @@ define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v2i64_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v3 -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8bf16: @@ -11041,47 +11341,55 @@ define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s19, 16 -; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s18, 16 -; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s17, 16 -; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s9, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_and_b32 s11, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s17, 16 +; SI-NEXT: s_and_b32 s13, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 ; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_add_u32 s9, s18, 3 -; SI-NEXT: s_addc_u32 s7, s19, 0 -; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s6, s19, 0 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s5, 16 +; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v2i64_to_v8bf16_scalar: @@ -11168,15 +11476,23 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11188,39 +11504,39 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -11584,15 +11900,23 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 @@ -13643,41 +13967,45 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v2f64_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v10 -; SI-NEXT: v_mov_b32_e32 v2, v11 -; SI-NEXT: v_mov_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v6, v9 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v2f64_to_v8i16: -; VI: ; %bb.0: +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8i16: +; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -13751,12 +14079,12 @@ define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[5:6], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -13765,19 +14093,27 @@ define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v10 -; SI-NEXT: v_mov_b32_e32 v2, v11 -; SI-NEXT: v_mov_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8i16_scalar: @@ -13860,13 +14196,19 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13878,37 +14220,37 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB74_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: .LBB74_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 @@ -13995,38 +14337,42 @@ define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v8i16_to_v2f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s13, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_cbranch_execnz .LBB75_3 ; SI-NEXT: .LBB75_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -14146,27 +14492,27 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -14177,23 +14523,35 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v10 -; SI-NEXT: v_mov_b32_e32 v1, v11 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16: @@ -14266,44 +14624,60 @@ define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i3 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v2f64_to_v8f16_scalar: @@ -14386,15 +14760,27 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14406,29 +14792,29 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14436,27 +14822,27 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14540,15 +14926,27 @@ define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i3 ; SI-LABEL: bitcast_v8f16_to_v2f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 @@ -14704,25 +15102,25 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; %bb.2: ; %Flow @@ -14731,20 +15129,32 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v11 -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8bf16: @@ -14828,15 +15238,15 @@ define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, ; SI-NEXT: .LBB81_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB81_5 ; SI-NEXT: .LBB81_3: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 @@ -14848,14 +15258,31 @@ define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: s_branch .LBB81_2 ; SI-NEXT: .LBB81_4: -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: .LBB81_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8bf16_scalar: @@ -14938,15 +15365,23 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14958,39 +15393,39 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -15354,15 +15789,23 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 @@ -17425,69 +17868,77 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v7 -; SI-NEXT: v_mov_b32_e32 v9, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v11, v4 -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: v_mov_b32_e32 v14, v1 -; SI-NEXT: v_mov_b32_e32 v15, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_4 -; SI-NEXT: .LBB88_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB88_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: .LBB88_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8f16: @@ -17569,45 +18020,65 @@ define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v8i16_to_v8f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB89_2 ; @@ -17712,56 +18183,80 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_alignbit_b32 v8, v2, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v6, v7, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8i16: @@ -17844,56 +18339,78 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v8i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v10, v5, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 ; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -18002,66 +18519,77 @@ define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v6 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_4 -; SI-NEXT: .LBB92_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB92_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_2 -; SI-NEXT: .LBB92_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8bf16: @@ -18143,66 +18671,78 @@ define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s22, s19, 16 +; SI-NEXT: s_lshr_b32 s21, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s8, s18, 16 -; SI-NEXT: s_lshl_b32 s9, s19, 16 -; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s10, s16, 16 +; SI-NEXT: s_lshl_b32 s13, s14, 16 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_lshl_b32 s12, s15, 16 +; SI-NEXT: s_lshl_b32 s7, s18, 16 ; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_lshl_b32 s13, s22, 16 -; SI-NEXT: s_lshl_b32 s12, s23, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_lshl_b32 s9, s22, 16 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s22, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s21, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s18, 0xffff -; SI-NEXT: s_lshl_b32 s7, s19, 16 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s8, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s17, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s12, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s6, 16 ; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s5, 16 -; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v6, s13 -; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v8i16_to_v8bf16_scalar: @@ -18306,41 +18846,42 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v5 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_4 -; SI-NEXT: .LBB94_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -18349,38 +18890,52 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB94_2 -; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v3, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v7, 16 +; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8i16: @@ -18725,70 +19280,86 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[6:7], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v11 -; SI-NEXT: v_mov_b32_e32 v3, v10 -; SI-NEXT: v_mov_b32_e32 v5, v12 -; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: @@ -19182,24 +19753,25 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v7 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 ; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 ; SI-NEXT: v_mov_b32_e32 v19, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -19207,7 +19779,6 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19219,14 +19790,14 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB96_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v8, v5, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v8, v5, v22 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v4, v1, v23 -; SI-NEXT: v_or_b32_e32 v12, v5, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v4, v1, v21 +; SI-NEXT: v_or_b32_e32 v12, v5, v20 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -19235,36 +19806,34 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; SI-NEXT: v_bfe_u32 v7, v21, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v20, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: .LBB96_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 @@ -19564,52 +20133,54 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v8i16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s29, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s28, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s29, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s9, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s27, 16 ; SI-NEXT: s_or_b32 s7, s7, s9 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 8 ; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 8 ; SI-NEXT: s_lshr_b32 s9, s5, 8 ; SI-NEXT: s_lshr_b32 s15, s7, 8 -; SI-NEXT: s_and_b32 s11, s19, 0xffff -; SI-NEXT: s_and_b32 s25, s23, 0xffff -; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s27, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s11, s26, 0x80008 +; SI-NEXT: s_bfe_u32 s13, s27, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s28, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s8, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s26, 16 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -19617,13 +20188,13 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 8 ; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s13, s5, 24 -; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s11, s5, 24 +; SI-NEXT: s_lshr_b32 s26, s5, 16 ; SI-NEXT: s_lshr_b32 s9, s5, 8 -; SI-NEXT: s_lshr_b32 s27, s7, 24 -; SI-NEXT: s_lshr_b32 s25, s7, 16 +; SI-NEXT: s_lshr_b32 s13, s7, 24 +; SI-NEXT: s_lshr_b32 s27, s7, 16 ; SI-NEXT: s_lshr_b32 s15, s7, 8 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -19632,16 +20203,16 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s11 -; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 ; SI-NEXT: v_mov_b32_e32 v11, s14 ; SI-NEXT: v_mov_b32_e32 v12, s7 ; SI-NEXT: v_mov_b32_e32 v13, s15 -; SI-NEXT: v_mov_b32_e32 v14, s25 -; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -19650,14 +20221,12 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v8i16_to_v16i8_scalar: @@ -19913,135 +20482,143 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v16, v2 -; SI-NEXT: v_or_b32_e32 v2, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_or_b32_e32 v4, v21, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v7, v15, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v3 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v5, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v6, v20, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v13, v0, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v10, v23, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v10, 16 -; SI-NEXT: v_or_b32_e32 v4, v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v13, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8i16: @@ -20499,13 +21076,13 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s14, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s12, s10, s9 -; SI-NEXT: s_or_b32 s43, s5, s12 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_or_b32 s43, s5, s9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 ; SI-NEXT: s_or_b32 s4, s4, s42 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s12, 16 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s9, 16 ; SI-NEXT: s_mov_b32 s7, s41 ; SI-NEXT: s_mov_b32 s5, s43 ; SI-NEXT: s_cbranch_execnz .LBB99_3 @@ -20564,25 +21141,33 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 ; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s9, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v16i8_to_v8i16_scalar: @@ -20929,42 +21514,46 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_4 -; SI-NEXT: .LBB100_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB100_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 @@ -20972,42 +21561,61 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: .LBB100_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8bf16: @@ -21090,70 +21698,98 @@ define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i ; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s23 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB101_3 ; SI-NEXT: .LBB101_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB101_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v8f16_to_v8bf16_scalar: @@ -21260,48 +21896,50 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v5 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_4 -; SI-NEXT: .LBB102_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB102_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -21311,42 +21949,60 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 -; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8f16: @@ -21699,78 +22355,102 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s23 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8f16_scalar: @@ -22187,16 +22867,27 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -22568,15 +23259,27 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 -; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 @@ -22919,55 +23622,53 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v15 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v4, v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v21 -; SI-NEXT: v_or_b32_e32 v4, v4, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -22975,62 +23676,76 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v9 -; SI-NEXT: v_mov_b32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: @@ -23470,19 +24185,19 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -23532,21 +24247,37 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB107_2 ; @@ -23894,15 +24625,23 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v8bf16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -24545,15 +25284,23 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 @@ -25243,55 +25990,53 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v16i8_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v1 -; SI-NEXT: v_mov_b32_e32 v17, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v15 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: v_or_b32_e32 v11, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v21, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_or_b32_e32 v7, v13, v2 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v3, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v11, v18, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_or_b32_e32 v13, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v15, v20, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -25300,77 +26045,90 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v12 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v15 -; SI-NEXT: v_mov_b32_e32 v4, v11 -; SI-NEXT: v_mov_b32_e32 v6, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8bf16: @@ -25790,9 +26548,9 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: v_readfirstlane_b32 s14, v0 ; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -25802,11 +26560,11 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_or_b32 s8, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 @@ -25814,7 +26572,7 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s25, 24 -; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_or_b32 s9, s5, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s27, 24 @@ -25822,21 +26580,21 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s14, s4, 16 -; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: s_and_b32 s4, s14, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s5, s12, 24 ; SI-NEXT: s_or_b32 s15, s5, s4 ; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s5, s12, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -25866,47 +26624,55 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s8, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s6, 0x3000000 ; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_lshl_b32 s8, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_and_b32 s9, s18, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s19, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s8, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s10, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s10, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s5, 16 ; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_lshl_b32 s11, s4, 16 ; SI-NEXT: .LBB111_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s11 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB111_4: ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index a8c54e8655882..8fbab2d6ab753 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -404,45 +404,51 @@ define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v5i32_to_v10i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10i16: @@ -538,16 +544,26 @@ define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s13, s17, 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s13 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s12 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s13, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s8, s4 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s12, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -647,15 +663,22 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v10i16_to_v5i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v4 -; SI-NEXT: v_mov_b32_e32 v12, v2 -; SI-NEXT: v_mov_b32_e32 v11, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -667,44 +690,44 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -798,45 +821,50 @@ define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v10i16_to_v5i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s15, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -966,76 +994,87 @@ define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v5i32_to_v10f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v13, v3 -; SI-NEXT: v_mov_b32_e32 v12, v2 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10f16: @@ -1114,55 +1153,75 @@ define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v5i32_to_v10f16_scalar: @@ -1255,17 +1314,32 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1277,33 +1351,33 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1311,10 +1385,10 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1322,24 +1396,24 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1428,17 +1502,32 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-LABEL: bitcast_v10f16_to_v5i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s24 -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 @@ -1614,45 +1703,51 @@ define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v5f32_to_v10i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10i16: @@ -1735,16 +1830,16 @@ define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -1754,21 +1849,32 @@ define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v3, s12 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: .LBB13_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v13 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v4, v11 -; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10i16_scalar: @@ -1871,15 +1977,22 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v10i16_to_v5f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v4 -; SI-NEXT: v_mov_b32_e32 v12, v2 -; SI-NEXT: v_mov_b32_e32 v11, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1891,44 +2004,44 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -2022,45 +2135,50 @@ define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v10i16_to_v5f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s15, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -2190,76 +2308,87 @@ define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v5f32_to_v10f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v13, v3 -; SI-NEXT: v_mov_b32_e32 v12, v2 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10f16: @@ -2336,55 +2465,75 @@ define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e64 v9, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v5f32_to_v10f16_scalar: @@ -2487,17 +2636,32 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2509,33 +2673,33 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2543,10 +2707,10 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2554,24 +2718,24 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2660,17 +2824,32 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v5f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s24 -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 @@ -2846,81 +3025,92 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v10i16_to_v10f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v19, v9 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_mov_b32_e32 v15, v5 -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v13, v3 -; SI-NEXT: v_mov_b32_e32 v12, v2 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v10f16: @@ -3007,53 +3197,78 @@ define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i ; SI-LABEL: bitcast_v10i16_to_v10f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB21_2 ; @@ -3173,66 +3388,96 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v10i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 ; SI-NEXT: v_or_b32_e32 v6, v6, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_alignbit_b32 v10, v2, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v10i16: @@ -3320,66 +3565,94 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v10i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshr_b64 v[11:12], v[5:6], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v12, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v11, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_or_b32_e32 v5, v14, v13 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v5, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 57eae8600dc4a..94ccde5a0a948 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1240,46 +1240,58 @@ define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v6i32_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12i16: @@ -1381,18 +1393,30 @@ define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s13, s19, 16 ; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s14 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -1498,16 +1522,25 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v6i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v4 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1519,52 +1552,52 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v13, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -1663,52 +1696,58 @@ define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v12i16_to_v6i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s24, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -1850,87 +1889,101 @@ define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v6i32_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: v_mov_b32_e32 v12, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12f16: @@ -2012,64 +2065,88 @@ define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v6i32_to_v12f16_scalar: @@ -2167,19 +2244,37 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2191,37 +2286,37 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2230,42 +2325,42 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2359,31 +2454,49 @@ define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v6i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 -; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v8, v4 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: s_cbranch_execnz .LBB19_3 @@ -3385,46 +3498,58 @@ define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v6f32_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12i16: @@ -3510,18 +3635,18 @@ define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v17, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s20, 1.0 -; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -3532,25 +3657,37 @@ define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v3, s12 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: .LBB29_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v2, v17 -; SI-NEXT: v_mov_b32_e32 v4, v14 -; SI-NEXT: v_mov_b32_e32 v6, v15 -; SI-NEXT: v_mov_b32_e32 v8, v12 -; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12i16_scalar: @@ -3657,16 +3794,25 @@ define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v6f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v4 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3678,52 +3824,52 @@ define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v13, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -3822,52 +3968,58 @@ define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v12i16_to_v6f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s24, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -4009,87 +4161,101 @@ define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v6f32_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: v_mov_b32_e32 v12, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12f16: @@ -4168,64 +4334,88 @@ define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_add_f32_e64 v11, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s21, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v6f32_to_v12f16_scalar: @@ -4332,19 +4522,37 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4356,37 +4564,37 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4395,42 +4603,42 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4524,19 +4732,37 @@ define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v6f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 @@ -5131,46 +5357,58 @@ define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v3i64_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12i16: @@ -5274,18 +5512,30 @@ define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s14 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -5391,16 +5641,25 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v4 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5412,52 +5671,52 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v13, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -5556,52 +5815,58 @@ define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v12i16_to_v3i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s24, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -5743,87 +6008,101 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v3i64_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v15, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12f16: @@ -5907,23 +6186,23 @@ define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -5938,33 +6217,57 @@ define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s13, s21, 0 ; SI-NEXT: s_lshr_b32 s14, s12, 16 ; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v3i64_to_v12f16_scalar: @@ -6062,19 +6365,37 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6086,37 +6407,37 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6125,42 +6446,42 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6254,19 +6575,37 @@ define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v3i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 @@ -6462,49 +6801,55 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v3f64_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v2, v17 -; SI-NEXT: v_mov_b32_e32 v4, v14 -; SI-NEXT: v_mov_b32_e32 v6, v15 -; SI-NEXT: v_mov_b32_e32 v8, v12 -; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12i16: @@ -6587,15 +6932,15 @@ define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 -; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -6606,25 +6951,37 @@ define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v3, s12 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v2, v17 -; SI-NEXT: v_mov_b32_e32 v4, v14 -; SI-NEXT: v_mov_b32_e32 v6, v15 -; SI-NEXT: v_mov_b32_e32 v8, v12 -; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12i16_scalar: @@ -6722,16 +7079,25 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v4 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6743,52 +7109,52 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB50_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v12 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v13, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -6887,52 +7253,58 @@ define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i ; SI-LABEL: bitcast_v12i16_to_v3f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s24, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -7077,37 +7449,37 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -7119,11 +7491,11 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -7131,20 +7503,38 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v15 -; SI-NEXT: v_mov_b32_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v2, v13 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v4, v12 -; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12f16: @@ -7220,61 +7610,85 @@ define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_add_f64 v[12:13], s[16:17], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f64 v[3:4], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v3f64_to_v12f16_scalar: @@ -7372,19 +7786,37 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7396,37 +7828,37 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7435,42 +7867,42 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7564,19 +7996,37 @@ define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, ; SI-LABEL: bitcast_v12f16_to_v3f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 @@ -7772,93 +8222,107 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v11 -; SI-NEXT: v_mov_b32_e32 v22, v10 -; SI-NEXT: v_mov_b32_e32 v21, v9 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v19, v7 -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_mov_b32_e32 v17, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_mov_b32_e32 v14, v2 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB56_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB56_4 -; SI-NEXT: .LBB56_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB56_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: .LBB56_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: @@ -7950,61 +8414,91 @@ define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i ; SI-LABEL: bitcast_v12i16_to_v12f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_branch .LBB57_2 ; @@ -8133,77 +8627,113 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_alignbit_b32 v12, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v10, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v12i16: @@ -8296,78 +8826,111 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v12i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v16, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v15, v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshr_b64 v[14:15], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v14 -; SI-NEXT: v_mov_b32_e32 v5, v15 -; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index e3b374b712717..cd5f3490a69e9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -452,55 +452,65 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v7i32_to_v14i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14i16: @@ -609,20 +619,34 @@ define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s15, s19, 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s23 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s14 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s15, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -735,18 +759,28 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_mov_b32_e32 v17, v4 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -758,44 +792,44 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -803,13 +837,13 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -916,60 +950,66 @@ define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v14i16_to_v7i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -1124,14 +1164,46 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v7i32_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1139,83 +1211,68 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: @@ -1301,73 +1358,101 @@ define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v7i32_to_v14f16_scalar: @@ -1471,21 +1556,42 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1497,23 +1603,20 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -1521,17 +1624,20 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1540,25 +1646,25 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1569,21 +1675,21 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1683,22 +1789,42 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-LABEL: bitcast_v14f16_to_v7i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 @@ -1915,55 +2041,65 @@ define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v7f32_to_v14i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14i16: @@ -2053,20 +2189,20 @@ define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 -; SI-NEXT: v_lshr_b64 v[9:10], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[12:13], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -2078,27 +2214,42 @@ define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v17, s18 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v3, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v11, s23 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: .LBB13_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v2, v20 -; SI-NEXT: v_mov_b32_e32 v4, v17 -; SI-NEXT: v_mov_b32_e32 v6, v18 -; SI-NEXT: v_mov_b32_e32 v8, v15 -; SI-NEXT: v_mov_b32_e32 v10, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14i16_scalar: @@ -2209,18 +2360,28 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_mov_b32_e32 v17, v4 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2232,44 +2393,44 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2277,13 +2438,13 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -2390,60 +2551,66 @@ define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v14i16_to_v7f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -2598,14 +2765,46 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v7f32_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -2613,83 +2812,68 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: @@ -2771,73 +2955,101 @@ define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s22, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v7f32_to_v14f16_scalar: @@ -2948,21 +3160,42 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2974,23 +3207,20 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -2998,17 +3228,20 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3017,25 +3250,25 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3046,21 +3279,21 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3160,31 +3393,51 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v7f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB19_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; SI-NEXT: v_or_b32_e32 v0, v19, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v2, v15, v2 @@ -3392,66 +3645,53 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v26, v12 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 -; SI-NEXT: v_mov_b32_e32 v23, v9 -; SI-NEXT: v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v21, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -3459,38 +3699,68 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: @@ -3588,70 +3858,104 @@ define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i ; SI-LABEL: bitcast_v14i16_to_v14f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 ; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_branch .LBB21_2 ; @@ -3789,87 +4093,129 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v14i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v2, v2, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_alignbit_b32 v14, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v13, v11, v13, 16 +; SI-NEXT: v_alignbit_b32 v12, v8, v12, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v14i16: @@ -3968,89 +4314,127 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v17, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_or_b32_e32 v18, v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v16, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshr_b64 v[14:15], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[16:17], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v14 -; SI-NEXT: v_mov_b32_e32 v5, v15 -; SI-NEXT: v_mov_b32_e32 v9, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index b846e0ee0a12f..075216fc4791c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1377,56 +1377,72 @@ define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v8i32_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16i16: @@ -1541,22 +1557,38 @@ define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s24, s19, 16 ; SI-NEXT: s_lshr_b32 s25, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s25 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s15 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -1675,19 +1707,31 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1699,49 +1743,49 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1750,15 +1794,15 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -1870,74 +1914,75 @@ define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v16i16_to_v8i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v9 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -1945,8 +1990,8 @@ define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -1954,10 +1999,10 @@ define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v16i16_to_v8i32_scalar: @@ -2103,15 +2148,51 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v8i32_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v7 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v5 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v20, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v1 -; SI-NEXT: v_mov_b32_e32 v23, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -2120,92 +2201,76 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16f16: @@ -2294,29 +2359,29 @@ define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -2335,41 +2400,73 @@ define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s9, s21, 16 ; SI-NEXT: s_lshr_b32 s10, s22, 16 ; SI-NEXT: s_lshr_b32 s11, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v8i32_to_v16f16_scalar: @@ -2478,23 +2575,47 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2506,26 +2627,22 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -2534,17 +2651,21 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2557,10 +2678,10 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -2569,18 +2690,18 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2588,23 +2709,23 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2709,24 +2830,47 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v8i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 @@ -2963,15 +3107,43 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v8i32_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -2980,76 +3152,68 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16bf16: @@ -3137,22 +3301,22 @@ define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s22, 16 -; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s20, 16 -; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s19, 16 -; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s17, 16 -; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_and_b32 s9, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s22, 16 +; SI-NEXT: s_and_b32 s11, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_and_b32 s13, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s20, 16 +; SI-NEXT: s_and_b32 s15, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s19, 16 +; SI-NEXT: s_and_b32 s25, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s18, 16 +; SI-NEXT: s_and_b32 s27, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s17, 16 +; SI-NEXT: s_and_b32 s29, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -3163,57 +3327,73 @@ define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s22, 16 -; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s20, 16 -; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s19, 16 -; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s17, 16 -; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_and_b32 s9, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s22, 16 +; SI-NEXT: s_and_b32 s11, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_and_b32 s13, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s20, 16 +; SI-NEXT: s_and_b32 s15, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s19, 16 +; SI-NEXT: s_and_b32 s25, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s18, 16 +; SI-NEXT: s_and_b32 s27, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s17, 16 +; SI-NEXT: s_and_b32 s29, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s16, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s29 -; SI-NEXT: v_mov_b32_e32 v1, s28 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: v_mov_b32_e32 v3, s26 -; SI-NEXT: v_mov_b32_e32 v4, s25 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v8i32_to_v16bf16_scalar: @@ -3322,23 +3502,39 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3350,26 +3546,22 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 ; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -3378,6 +3570,10 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -3385,11 +3581,11 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3397,26 +3593,26 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 @@ -4050,24 +4246,39 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 @@ -8518,56 +8729,72 @@ define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v8f32_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB36_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16i16: @@ -8660,22 +8887,22 @@ define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v23, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s22, 1.0 -; SI-NEXT: v_lshr_b64 v[13:14], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -8688,31 +8915,47 @@ define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v23, s17 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_mov_b32_e32 v21, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v3, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: v_mov_b32_e32 v14, s15 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s10 ; SI-NEXT: .LBB37_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v22 -; SI-NEXT: v_mov_b32_e32 v2, v23 -; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v6, v21 -; SI-NEXT: v_mov_b32_e32 v8, v18 -; SI-NEXT: v_mov_b32_e32 v10, v19 -; SI-NEXT: v_mov_b32_e32 v12, v16 -; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16i16_scalar: @@ -8827,19 +9070,31 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8851,49 +9106,49 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB38_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: .LBB38_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -8902,15 +9157,15 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -9022,74 +9277,75 @@ define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v16i16_to_v8f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v9 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -9097,8 +9353,8 @@ define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: .LBB39_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -9106,10 +9362,10 @@ define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB39_2 ; ; VI-LABEL: bitcast_v16i16_to_v8f32_scalar: @@ -9255,15 +9511,51 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v8f32_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v7 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v5 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v20, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v1 -; SI-NEXT: v_mov_b32_e32 v23, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -9272,92 +9564,76 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_4 -; SI-NEXT: .LBB40_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB40_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 -; SI-NEXT: .LBB40_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: @@ -9441,82 +9717,114 @@ define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v8f32_to_v16f16_scalar: @@ -9631,23 +9939,47 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9659,26 +9991,22 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -9687,17 +10015,21 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9710,10 +10042,10 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9722,18 +10054,18 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -9741,23 +10073,23 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9862,24 +10194,47 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v8f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 @@ -10116,15 +10471,43 @@ define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v8f32_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -10133,76 +10516,68 @@ define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16bf16: @@ -10311,23 +10686,23 @@ define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a ; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB45_5 ; SI-NEXT: .LBB45_3: ; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 @@ -10350,19 +10725,52 @@ define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a ; SI-NEXT: v_mov_b32_e32 v0, s29 ; SI-NEXT: v_mov_b32_e32 v1, s28 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: v_mov_b32_e32 v3, s26 -; SI-NEXT: v_mov_b32_e32 v4, s25 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v6, s11 ; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[15:16], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[13:14], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16bf16_scalar: @@ -10477,23 +10885,39 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10505,26 +10929,22 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 ; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -10533,6 +10953,10 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -10540,11 +10964,11 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -10552,26 +10976,26 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 @@ -11205,24 +11629,39 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 @@ -15249,56 +15688,72 @@ define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v4i64_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16i16: @@ -15415,22 +15870,38 @@ define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s25 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s15 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -15549,19 +16020,31 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15573,49 +16056,49 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB58_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: .LBB58_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -15624,15 +16107,15 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -15744,74 +16227,75 @@ define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v16i16_to_v4i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v9 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -15819,8 +16303,8 @@ define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -15828,10 +16312,10 @@ define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v16i16_to_v4i64_scalar: @@ -15977,65 +16461,7 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v4i64_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -16044,42 +16470,120 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v21, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v19, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: @@ -16170,29 +16674,29 @@ define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -16211,41 +16715,73 @@ define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s17, s23, 0 ; SI-NEXT: s_lshr_b32 s18, s16, 16 ; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v4i64_to_v16f16_scalar: @@ -16354,23 +16890,47 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16382,26 +16942,22 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -16410,17 +16966,21 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16433,10 +16993,10 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -16445,18 +17005,18 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -16464,23 +17024,23 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16585,24 +17145,47 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v4i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 @@ -16839,15 +17422,43 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v4i64_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -16856,76 +17467,68 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16bf16: @@ -17015,83 +17618,99 @@ define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s22, 16 -; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s20, 16 -; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s19, 16 -; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s17, 16 -; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_and_b32 s9, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s22, 16 +; SI-NEXT: s_and_b32 s11, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_and_b32 s13, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s20, 16 +; SI-NEXT: s_and_b32 s15, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s19, 16 +; SI-NEXT: s_and_b32 s25, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s18, 16 +; SI-NEXT: s_and_b32 s27, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s17, 16 +; SI-NEXT: s_and_b32 s29, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 ; SI-NEXT: s_addc_u32 s5, s17, 0 ; SI-NEXT: s_add_u32 s16, s18, 3 -; SI-NEXT: s_addc_u32 s15, s19, 0 -; SI-NEXT: s_add_u32 s13, s20, 3 -; SI-NEXT: s_addc_u32 s11, s21, 0 -; SI-NEXT: s_add_u32 s9, s22, 3 -; SI-NEXT: s_addc_u32 s7, s23, 0 -; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_and_b32 s24, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s16, 16 -; SI-NEXT: s_and_b32 s26, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s5, 16 -; SI-NEXT: s_and_b32 s28, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s4, 16 +; SI-NEXT: s_addc_u32 s14, s19, 0 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s10, s21, 0 +; SI-NEXT: s_add_u32 s8, s22, 3 +; SI-NEXT: s_addc_u32 s6, s23, 0 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s25, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s16, 16 +; SI-NEXT: s_and_b32 s27, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s5, 16 +; SI-NEXT: s_and_b32 s29, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s29 -; SI-NEXT: v_mov_b32_e32 v1, s28 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: v_mov_b32_e32 v3, s26 -; SI-NEXT: v_mov_b32_e32 v4, s25 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v4i64_to_v16bf16_scalar: @@ -17200,23 +17819,39 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17228,26 +17863,22 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 ; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -17256,6 +17887,10 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -17263,11 +17898,11 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -17275,26 +17910,26 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 @@ -17928,24 +18563,39 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 @@ -21495,60 +22145,68 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v4f64_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB72_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v22 -; SI-NEXT: v_mov_b32_e32 v2, v23 -; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v6, v21 -; SI-NEXT: v_mov_b32_e32 v8, v18 -; SI-NEXT: v_mov_b32_e32 v10, v19 -; SI-NEXT: v_mov_b32_e32 v12, v16 -; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16i16: @@ -21636,18 +22294,18 @@ define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i ; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[13:14], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: ; SI-NEXT: ; implicit-def: $sgpr10 @@ -21660,31 +22318,47 @@ define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_mov_b32_e32 v21, s19 -; SI-NEXT: v_mov_b32_e32 v23, s17 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v3, s14 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v22 -; SI-NEXT: v_mov_b32_e32 v2, v23 -; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v6, v21 -; SI-NEXT: v_mov_b32_e32 v8, v18 -; SI-NEXT: v_mov_b32_e32 v10, v19 -; SI-NEXT: v_mov_b32_e32 v12, v16 -; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16i16_scalar: @@ -21787,19 +22461,31 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v4 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -21811,49 +22497,49 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB74_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: .LBB74_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -21862,15 +22548,15 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -21982,74 +22668,75 @@ define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i ; SI-LABEL: bitcast_v16i16_to_v4f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v9 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_cbranch_execnz .LBB75_3 ; SI-NEXT: .LBB75_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -22057,8 +22744,8 @@ define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: .LBB75_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -22066,10 +22753,10 @@ define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB75_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB75_2 ; ; VI-LABEL: bitcast_v16i16_to_v4f64_scalar: @@ -22218,47 +22905,47 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -22273,39 +22960,63 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v22 -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v2, v21 -; SI-NEXT: v_mov_b32_e32 v3, v20 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v5, v17 -; SI-NEXT: v_mov_b32_e32 v6, v18 -; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: @@ -22384,78 +23095,110 @@ define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v4f64_to_v16f16_scalar: @@ -22558,23 +23301,47 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22586,26 +23353,22 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_or_b32_e32 v3, v21, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -22614,17 +23377,21 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22637,10 +23404,10 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22649,18 +23416,18 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -22668,23 +23435,23 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -22789,24 +23556,47 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-LABEL: bitcast_v16f16_to_v4f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 @@ -23044,42 +23834,42 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -23092,32 +23882,56 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v23 -; SI-NEXT: v_mov_b32_e32 v1, v22 -; SI-NEXT: v_mov_b32_e32 v2, v21 -; SI-NEXT: v_mov_b32_e32 v3, v20 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v6, v17 -; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16bf16: @@ -23216,24 +24030,24 @@ define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg % ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[4:5], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[20:21], 1.0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_add_f64 v[13:14], s[20:21], 1.0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB81_5 ; SI-NEXT: .LBB81_3: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 @@ -23253,22 +24067,55 @@ define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg % ; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: s_branch .LBB81_2 ; SI-NEXT: .LBB81_4: -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v9, s29 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v7, s26 ; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_mov_b32_e32 v8, s14 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v12, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v14, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v15, s9 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: .LBB81_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[15:16], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[13:14], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16bf16_scalar: @@ -23371,23 +24218,39 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -23399,26 +24262,22 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 ; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 ; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -23427,6 +24286,10 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -23434,11 +24297,11 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -23446,26 +24309,26 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 @@ -24099,24 +24962,39 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v32 @@ -27683,23 +28561,51 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: v_mov_b32_e32 v30, v14 -; SI-NEXT: v_mov_b32_e32 v29, v13 -; SI-NEXT: v_mov_b32_e32 v28, v12 -; SI-NEXT: v_mov_b32_e32 v27, v11 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_mov_b32_e32 v25, v9 -; SI-NEXT: v_mov_b32_e32 v24, v8 -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v5 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v18, v2 -; SI-NEXT: v_mov_b32_e32 v17, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -27708,48 +28614,6 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_4 -; SI-NEXT: .LBB88_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB88_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -27758,42 +28622,76 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: .LBB88_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16f16: @@ -27896,80 +28794,117 @@ define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i ; SI-LABEL: bitcast_v16i16_to_v16f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v16, v1 -; SI-NEXT: v_mov_b32_e32 v17, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB89_2 ; @@ -28116,98 +29051,146 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_alignbit_b32 v16, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 +; SI-NEXT: v_alignbit_b32 v14, v6, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v13, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16i16: @@ -28311,102 +29294,144 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v17, v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v23, v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v20, v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshr_b64 v[18:19], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[16:17], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v22, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 ; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v18 -; SI-NEXT: v_mov_b32_e32 v5, v21 -; SI-NEXT: v_mov_b32_e32 v9, v19 -; SI-NEXT: v_mov_b32_e32 v13, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -28557,50 +29582,24 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v14 -; SI-NEXT: v_mov_b32_e32 v22, v12 -; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v19, v6 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v2 -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_4 -; SI-NEXT: .LBB92_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB92_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -28608,59 +29607,112 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_2 -; SI-NEXT: .LBB92_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16bf16: @@ -28763,115 +29815,142 @@ define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a ; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s8, s18, 16 -; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s25, s16, 16 +; SI-NEXT: s_lshl_b32 s29, s40, 16 +; SI-NEXT: s_lshl_b32 s15, s17, 16 +; SI-NEXT: s_lshl_b32 s28, s41, 16 +; SI-NEXT: s_lshl_b32 s13, s18, 16 +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_lshl_b32 s11, s19, 16 +; SI-NEXT: s_lshl_b32 s26, s43, 16 ; SI-NEXT: s_lshl_b32 s10, s20, 16 -; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_lshl_b32 s12, s22, 16 -; SI-NEXT: s_lshl_b32 s13, s23, 16 -; SI-NEXT: s_lshl_b32 s14, s24, 16 -; SI-NEXT: s_lshl_b32 s15, s25, 16 -; SI-NEXT: s_lshl_b32 s40, s26, 16 -; SI-NEXT: s_lshl_b32 s41, s27, 16 -; SI-NEXT: s_lshl_b32 s42, s28, 16 -; SI-NEXT: s_lshl_b32 s43, s29, 16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: s_lshl_b32 s24, s44, 16 +; SI-NEXT: s_lshl_b32 s8, s21, 16 +; SI-NEXT: s_lshl_b32 s14, s45, 16 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_lshl_b32 s9, s47, 16 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s26, 0xffff -; SI-NEXT: s_lshl_b32 s6, s27, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s5, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s46, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xffff -; SI-NEXT: s_lshl_b32 s7, s25, 16 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s14, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s22, 0xffff -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s6, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s45, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s12, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s44, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s43, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s10, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s18, 0xffff -; SI-NEXT: s_lshl_b32 s7, s19, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s42, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s17, 0xffff +; SI-NEXT: s_lshl_b32 s11, s41, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s8, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s16, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_and_b32 s41, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s5, 16 -; SI-NEXT: s_and_b32 s43, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s4, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_and_b32 s29, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s11, 16 +; SI-NEXT: s_and_b32 s28, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s10, 16 +; SI-NEXT: s_and_b32 s27, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s9, 16 +; SI-NEXT: s_and_b32 s26, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s8, 16 +; SI-NEXT: s_and_b32 s24, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s14, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_and_b32 s12, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: v_mov_b32_e32 v7, s13 -; SI-NEXT: v_mov_b32_e32 v8, s14 -; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_mov_b32_e32 v10, s40 -; SI-NEXT: v_mov_b32_e32 v11, s41 -; SI-NEXT: v_mov_b32_e32 v12, s42 -; SI-NEXT: v_mov_b32_e32 v13, s43 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s13 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s11 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v16i16_to_v16bf16_scalar: @@ -29017,69 +30096,79 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v9 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_4 -; SI-NEXT: .LBB94_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v22 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -29092,66 +30181,92 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB94_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB94_2 -; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_alignbit_b32 v6, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v5, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v15 +; SI-NEXT: v_alignbit_b32 v3, v10, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v17, 16 +; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16i16: @@ -29762,127 +30877,158 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshr_b64 v[8:9], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshr_b64 v[13:14], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshr_b64 v[6:7], v[19:20], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[14:15], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v3 +; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[23:24], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[6:7], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v3, v22 -; SI-NEXT: v_mov_b32_e32 v5, v24 -; SI-NEXT: v_mov_b32_e32 v7, v20 -; SI-NEXT: v_mov_b32_e32 v9, v25 -; SI-NEXT: v_mov_b32_e32 v11, v18 -; SI-NEXT: v_mov_b32_e32 v13, v26 -; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: @@ -30573,40 +31719,37 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v49, v11 -; SI-NEXT: v_mov_b32_e32 v50, v7 -; SI-NEXT: v_mov_b32_e32 v51, v3 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 ; SI-NEXT: v_mov_b32_e32 v38, v4 -; SI-NEXT: v_mov_b32_e32 v35, v2 -; SI-NEXT: v_mov_b32_e32 v39, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v50 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -30614,7 +31757,6 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -30622,7 +31764,6 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -30630,28 +31771,33 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB96_2 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB96_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_4 +; SI-NEXT: .LBB96_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB96_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v8, v5, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v12, v5, v50 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v8, v5, v55 +; SI-NEXT: v_or_b32_e32 v16, v5, v53 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v5, v54 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v5, v41 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v20, v5, v40 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v24, v5, v43 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: v_or_b32_e32 v4, v1, v52 -; SI-NEXT: v_or_b32_e32 v28, v5, v42 +; SI-NEXT: v_or_b32_e32 v20, v5, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v24, v5, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v4, v1, v48 +; SI-NEXT: v_or_b32_e32 v28, v5, v54 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -30668,65 +31814,60 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v48 -; SI-NEXT: v_bfe_u32 v7, v51, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v50, 8, 8 -; SI-NEXT: v_bfe_u32 v23, v49, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v48, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB96_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: .LBB96_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 @@ -30753,13 +31894,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v32i8: @@ -31232,189 +32367,186 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v16i16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s78, v1 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s79, v0 +; SI-NEXT: s_lshr_b32 s75, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s22, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s20, 16 +; SI-NEXT: s_lshr_b32 s73, s19, 16 +; SI-NEXT: s_lshr_b32 s77, s18, 16 +; SI-NEXT: s_lshr_b32 s72, s17, 16 +; SI-NEXT: s_lshr_b32 s76, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s76, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s72, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s73, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s78, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s74, 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s79, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s79, 0xffff -; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s75, 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 ; SI-NEXT: s_or_b32 s11, s11, s13 ; SI-NEXT: s_lshr_b32 s13, s5, 8 -; SI-NEXT: s_lshr_b32 s41, s7, 8 -; SI-NEXT: s_lshr_b32 s47, s9, 8 +; SI-NEXT: s_lshr_b32 s25, s7, 8 +; SI-NEXT: s_lshr_b32 s29, s9, 8 ; SI-NEXT: s_lshr_b32 s88, s11, 8 -; SI-NEXT: s_and_b32 s15, s19, 0xffff -; SI-NEXT: s_and_b32 s45, s23, 0xffff -; SI-NEXT: s_and_b32 s59, s27, 0xffff -; SI-NEXT: s_and_b32 s90, s78, 0xffff -; SI-NEXT: s_bfe_u32 s43, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s57, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s89, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s91, s78, 0x80008 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 +; SI-NEXT: s_bfe_u32 s15, s72, 0x80008 +; SI-NEXT: s_bfe_u32 s27, s73, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s74, 0x80008 +; SI-NEXT: s_bfe_u32 s89, s75, 0x80008 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s9, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s12, s72, 16 ; SI-NEXT: s_or_b32 s5, s12, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 -; SI-NEXT: s_lshr_b32 s43, s5, 24 -; SI-NEXT: s_lshr_b32 s15, s5, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b32 s15, s5, 24 +; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s13, s5, 8 -; SI-NEXT: s_lshr_b32 s57, s7, 24 -; SI-NEXT: s_lshr_b32 s45, s7, 16 -; SI-NEXT: s_lshr_b32 s41, s7, 8 -; SI-NEXT: s_lshr_b32 s89, s9, 24 -; SI-NEXT: s_lshr_b32 s59, s9, 16 -; SI-NEXT: s_lshr_b32 s47, s9, 8 -; SI-NEXT: s_lshr_b32 s91, s11, 24 -; SI-NEXT: s_lshr_b32 s90, s11, 16 +; SI-NEXT: s_lshr_b32 s27, s7, 24 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s25, s7, 8 +; SI-NEXT: s_lshr_b32 s41, s9, 24 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s29, s9, 8 +; SI-NEXT: s_lshr_b32 s89, s11, 24 +; SI-NEXT: s_lshr_b32 s75, s11, 16 ; SI-NEXT: s_lshr_b32 s88, s11, 8 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: v_mov_b32_e32 v1, s24 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s12 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s15 -; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v6, s72 +; SI-NEXT: v_mov_b32_e32 v7, s15 ; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s46 -; SI-NEXT: v_mov_b32_e32 v10, s44 -; SI-NEXT: v_mov_b32_e32 v11, s42 +; SI-NEXT: v_mov_b32_e32 v9, s40 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 ; SI-NEXT: v_mov_b32_e32 v12, s7 -; SI-NEXT: v_mov_b32_e32 v13, s41 -; SI-NEXT: v_mov_b32_e32 v14, s45 -; SI-NEXT: v_mov_b32_e32 v15, s57 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s73 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v16, s8 -; SI-NEXT: v_mov_b32_e32 v17, s60 -; SI-NEXT: v_mov_b32_e32 v18, s58 -; SI-NEXT: v_mov_b32_e32 v19, s56 +; SI-NEXT: v_mov_b32_e32 v17, s46 +; SI-NEXT: v_mov_b32_e32 v18, s44 +; SI-NEXT: v_mov_b32_e32 v19, s42 ; SI-NEXT: v_mov_b32_e32 v20, s9 -; SI-NEXT: v_mov_b32_e32 v21, s47 -; SI-NEXT: v_mov_b32_e32 v22, s59 -; SI-NEXT: v_mov_b32_e32 v23, s89 +; SI-NEXT: v_mov_b32_e32 v21, s29 +; SI-NEXT: v_mov_b32_e32 v22, s74 +; SI-NEXT: v_mov_b32_e32 v23, s41 ; SI-NEXT: v_mov_b32_e32 v24, s10 -; SI-NEXT: v_mov_b32_e32 v25, s74 -; SI-NEXT: v_mov_b32_e32 v26, s62 -; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: v_mov_b32_e32 v25, s60 +; SI-NEXT: v_mov_b32_e32 v26, s58 +; SI-NEXT: v_mov_b32_e32 v27, s56 ; SI-NEXT: v_mov_b32_e32 v28, s11 ; SI-NEXT: v_mov_b32_e32 v29, s88 -; SI-NEXT: v_mov_b32_e32 v30, s90 -; SI-NEXT: v_mov_b32_e32 v31, s91 +; SI-NEXT: v_mov_b32_e32 v30, s75 +; SI-NEXT: v_mov_b32_e32 v31, s89 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v16i16_to_v32i8_scalar: @@ -31854,125 +32986,119 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v27 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v2 -; SI-NEXT: v_or_b32_e32 v2, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v4, v39, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v7, v48, v6 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v5, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v21, v0, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v23, v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v9, v9, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v8, v50, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v51, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v9, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v10, v9, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v29, v0, v8 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v10, v37, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v15, v0, v9 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_or_b32_e32 v12, v53, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v15, v27, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; SI-NEXT: v_or_b32_e32 v14, v13, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v8, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v11, v0, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_or_b32_e32 v18, v55, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v12, v52, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v18, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v13, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v14, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -31981,128 +33107,150 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v55, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v1 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_or_b32_e32 v0, v19, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v21, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v13, v1, v23, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v21 -; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16i16: @@ -32812,230 +33960,272 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v22, v14 -; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: v_readfirstlane_b32 s63, v17 +; SI-NEXT: v_readfirstlane_b32 s62, v16 +; SI-NEXT: v_readfirstlane_b32 s74, v15 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s77, v13 +; SI-NEXT: v_readfirstlane_b32 s76, v12 +; SI-NEXT: v_readfirstlane_b32 s78, v11 +; SI-NEXT: v_readfirstlane_b32 s79, v10 +; SI-NEXT: v_readfirstlane_b32 s57, v9 +; SI-NEXT: v_readfirstlane_b32 s56, v8 +; SI-NEXT: v_readfirstlane_b32 s58, v7 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s61, v5 +; SI-NEXT: v_readfirstlane_b32 s60, v4 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_readfirstlane_b32 s73, v2 +; SI-NEXT: v_readfirstlane_b32 s46, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v0 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s8, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s21, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s22, 0xff ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s23, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s44, s11, s9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s12, s6, s5 -; SI-NEXT: s_or_b32 s6, s4, s12 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s47, 0xff +; SI-NEXT: s_or_b32 s10, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s46, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s45, s11, s9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_or_b32 s14, s7, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s15, s8, s45 +; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: s_lshl_b32 s9, s58, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s14, s6, s5 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s56, 0xff +; SI-NEXT: s_or_b32 s6, s4, s14 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s57, 24 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s88, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s61, 24 +; SI-NEXT: s_or_b32 s41, s8, s88 +; SI-NEXT: s_and_b32 s8, s75, 0xff +; SI-NEXT: s_lshl_b32 s9, s74, 8 +; SI-NEXT: s_or_b32 s40, s7, s5 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s78, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s62, 0xff ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s13, s5, s7 -; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 -; SI-NEXT: s_and_b32 s5, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 -; SI-NEXT: s_or_b32 s5, s5, s9 -; SI-NEXT: s_and_b32 s9, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v9, v23 -; SI-NEXT: v_or_b32_e32 v13, v24, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v15, v0, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_and_b32 s7, s76, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v10, v10, v1 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: v_or_b32_e32 v26, v5, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v21 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s12, s10, s9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: s_or_b32 s15, s5, s12 +; SI-NEXT: s_lshl_b32 s11, s63, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s89, s11, s9 +; SI-NEXT: s_lshl_b32 s9, s77, 24 +; SI-NEXT: s_or_b32 s42, s9, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xffff ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v3, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 -; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_or_b32_e32 v19, v11, v9 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v17, v17, v13 -; SI-NEXT: v_mov_b32_e32 v18, v14 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s43, s7, s89 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: s_or_b32 s8, s5, s42 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; SI-NEXT: s_mov_b32 s5, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s15, s44, 16 +; SI-NEXT: s_lshr_b32 s41, s45, 16 +; SI-NEXT: s_lshr_b32 s43, s88, 16 +; SI-NEXT: s_lshr_b32 s13, s89, 16 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_add_i32 s76, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_and_b32 s6, s76, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s5, s77, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s28, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s63, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s60, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s61, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: s_lshl_b32 s6, s58, 8 +; SI-NEXT: s_add_i32 s56, s56, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v25, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v2, v23, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_and_b32 s7, s56, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s57, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v10, v24, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_and_b32 s10, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s9, s22, 0xff -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s11, s47, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_lshl_b32 s10, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s10, s16, 0xff +; SI-NEXT: s_lshl_b32 s11, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s18, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s19, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s20, 0xff +; SI-NEXT: s_lshl_b32 s12, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s22, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s23, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[17:18], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s15, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 16 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s15, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s41, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s13, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, v19 -; SI-NEXT: v_mov_b32_e32 v10, v20 -; SI-NEXT: v_mov_b32_e32 v12, v17 -; SI-NEXT: v_mov_b32_e32 v14, v18 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v32i8_to_v16i16_scalar: @@ -33714,65 +34904,84 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_4 -; SI-NEXT: .LBB100_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB100_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -33788,75 +34997,108 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB100_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: .LBB100_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16bf16: @@ -33960,60 +35202,83 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB101_3 ; SI-NEXT: .LBB101_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 @@ -34021,66 +35286,98 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB101_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v16f16_to_v16bf16_scalar: @@ -34229,80 +35526,90 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_4 -; SI-NEXT: .LBB102_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB102_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -34320,74 +35627,108 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 -; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16f16: @@ -35008,143 +36349,190 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v1 +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16f16_scalar: @@ -35878,25 +37266,47 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v14 -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -36527,44 +37937,66 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_or_b32_e32 v48, v16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v48, v8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v8, v1 +; SI-NEXT: v_or_b32_e32 v49, v0, v1 ; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v35, v52, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v35, v39, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v39, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v37, v55, v2 +; SI-NEXT: v_or_b32_e32 v36, v32, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v37, v54, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v54, v2 +; SI-NEXT: v_or_b32_e32 v38, v53, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 ; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 @@ -36572,7 +38004,7 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_or_b32_e32 v34, v0, v2 +; SI-NEXT: v_or_b32_e32 v34, v40, v2 ; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 @@ -36593,7 +38025,7 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -36601,59 +38033,59 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_or_b32_e32 v34, v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v37, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_or_b32_e32 v34, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v37, v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v35, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v38, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v35, v3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v36, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v48, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v48, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v49, v0, v1 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 ; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 ; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 @@ -37156,103 +38588,99 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v34, v4 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v55 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v8, v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v8, v8, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v39 -; SI-NEXT: v_or_b32_e32 v5, v5, v48 -; SI-NEXT: v_or_b32_e32 v6, v6, v49 -; SI-NEXT: v_or_b32_e32 v7, v7, v50 -; SI-NEXT: v_or_b32_e32 v8, v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 @@ -37265,6 +38693,9 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -37274,104 +38705,129 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v10, v21 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: @@ -38117,11 +39573,11 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -38133,11 +39589,11 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s8, 0xff ; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s11, 0xff ; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -38149,19 +39605,19 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_and_b32 s4, s43, 0xff ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -38251,37 +39707,69 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB107_2 ; @@ -38961,23 +40449,39 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v16bf16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -40126,57 +41630,71 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_and_b32 s10, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s19, 16 +; SI-NEXT: s_and_b32 s14, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s18, 16 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s17 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s13 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 -; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 -; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshr_b64 v[53:54], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v59 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 ; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshr_b64 v[42:43], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v58 ; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 ; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 @@ -40187,12 +41705,12 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 ; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v56 ; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v58 ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 ; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 ; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 @@ -40201,49 +41719,49 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[42:43], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[39:40], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[53:54], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 @@ -40266,27 +41784,26 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 ; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v24 ; SI-NEXT: .LBB109_3: ; %end ; SI-NEXT: v_mov_b32_e32 v13, v20 ; SI-NEXT: v_mov_b32_e32 v20, v40 ; SI-NEXT: v_mov_b32_e32 v24, v42 ; SI-NEXT: v_mov_b32_e32 v28, v43 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v0, v50 ; SI-NEXT: v_mov_b32_e32 v2, v48 ; SI-NEXT: v_mov_b32_e32 v4, v51 @@ -41447,99 +42964,97 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v33 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: v_or_b32_e32 v31, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v39, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v7, v49, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v17 -; SI-NEXT: v_or_b32_e32 v23, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v27, v50, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v11, v21, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v25 -; SI-NEXT: v_or_b32_e32 v32, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v13, v52, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v2, v2, v55 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v15, v54, v2 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v3, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v31, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v11, v32, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_or_b32_e32 v13, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v15, v35, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v21, v48, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: v_or_b32_e32 v23, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v49, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v33, v51, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v25 +; SI-NEXT: v_or_b32_e32 v34, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v37, v52, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -41558,15 +43073,15 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -41574,126 +43089,150 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v48, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v28 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v31 -; SI-NEXT: v_mov_b32_e32 v6, v19 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v9, v27 -; SI-NEXT: v_mov_b32_e32 v10, v29 -; SI-NEXT: v_mov_b32_e32 v12, v32 -; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16bf16: @@ -42403,232 +43942,265 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_readfirstlane_b32 s42, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v14 -; SI-NEXT: v_readfirstlane_b32 s40, v7 -; SI-NEXT: v_readfirstlane_b32 s41, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_readfirstlane_b32 s73, v17 +; SI-NEXT: v_readfirstlane_b32 s72, v16 +; SI-NEXT: v_readfirstlane_b32 s77, v15 +; SI-NEXT: v_readfirstlane_b32 s78, v14 +; SI-NEXT: v_readfirstlane_b32 s58, v13 +; SI-NEXT: v_readfirstlane_b32 s57, v12 +; SI-NEXT: v_readfirstlane_b32 s61, v11 +; SI-NEXT: v_readfirstlane_b32 s63, v10 +; SI-NEXT: v_readfirstlane_b32 s42, v9 +; SI-NEXT: v_readfirstlane_b32 s41, v8 +; SI-NEXT: v_readfirstlane_b32 s45, v7 +; SI-NEXT: v_readfirstlane_b32 s56, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 -; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s17, 24 -; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_or_b32 s8, s5, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_or_b32 s12, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s11, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: s_or_b32 s15, s5, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s25, 24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v3 -; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_or_b32 s14, s5, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_or_b32_e32 v17, v9, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v4 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_or_b32 s44, s5, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_or_b32_e32 v9, v0, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_lshl_b32 s43, s4, 16 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s6, 24 +; SI-NEXT: s_or_b32 s47, s5, s4 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s13, 24 +; SI-NEXT: s_or_b32 s46, s5, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: v_or_b32_e32 v19, v1, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v10 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s10, 24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_or_b32_e32 v18, v13, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 +; SI-NEXT: s_or_b32 s60, s5, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_lshl_b32 s44, s4, 16 -; SI-NEXT: v_or_b32_e32 v13, v5, v7 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v16 +; SI-NEXT: s_lshl_b32 s59, s4, 16 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s42, 24 +; SI-NEXT: s_or_b32 s74, s5, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s61, 24 +; SI-NEXT: s_or_b32 s62, s5, s4 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s58, 24 +; SI-NEXT: s_or_b32 s76, s5, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_lshl_b32 s45, s4, 16 -; SI-NEXT: v_or_b32_e32 v15, v6, v7 -; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: s_lshl_b32 s75, s4, 16 +; SI-NEXT: s_and_b32 s4, s72, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s73, 24 +; SI-NEXT: s_or_b32 s79, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v12 +; SI-NEXT: s_add_i32 s78, s78, 3 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 +; SI-NEXT: s_add_i32 s72, s72, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_and_b32 s8, s72, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s9, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s10, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s5, s73, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_add_i32 s63, s63, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: s_lshl_b32 s8, s61, 8 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_and_b32 s11, s57, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s27, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s58, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_and_b32 s8, s56, 0xff +; SI-NEXT: s_lshl_b32 s11, s45, 8 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s12, s41, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s11, s42, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s8, 0x3000000 +; SI-NEXT: s_and_b32 s8, s40, 0xff +; SI-NEXT: s_lshl_b32 s11, s13, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s8, 0x3000000 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s24, 0xff +; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s10, s26, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xff -; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xff +; SI-NEXT: s_lshl_b32 s10, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s22, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_and_b32 s11, s22, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s10, s23, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s10, s8 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 -; SI-NEXT: s_add_i32 s8, s6, 0x3000000 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s10, s8, 0x3000000 +; SI-NEXT: s_and_b32 s8, s16, 0xff +; SI-NEXT: s_lshl_b32 s11, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s12, s18, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s11, s19, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_or_b32 s8, s11, s8 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_and_b32 s12, s8, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s5, 16 -; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s4, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: s_branch .LBB111_5 -; SI-NEXT: .LBB111_3: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s10, 16 +; SI-NEXT: s_and_b32 s44, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s7, 16 +; SI-NEXT: s_and_b32 s47, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s6, 16 +; SI-NEXT: s_and_b32 s60, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s9, 16 +; SI-NEXT: s_and_b32 s74, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s41, 16 +; SI-NEXT: s_and_b32 s76, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s5, 16 +; SI-NEXT: s_and_b32 s79, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s43 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s60 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s75 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: s_branch .LBB111_2 -; SI-NEXT: .LBB111_4: -; SI-NEXT: v_mov_b32_e32 v10, s44 -; SI-NEXT: v_mov_b32_e32 v14, s45 -; SI-NEXT: .LBB111_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v11, v19 -; SI-NEXT: v_mov_b32_e32 v12, v18 -; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16bf16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 4e60831ca3da5..6c8abf8733579 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -498,65 +498,79 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v9i32_to_v18i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18i16: @@ -677,24 +691,42 @@ define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s28, s17, 16 ; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s27 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -820,21 +852,34 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v8 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v4 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -846,54 +891,54 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v19 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -903,15 +948,15 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v19, v7 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -1030,82 +1075,83 @@ define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v18i16_to_v9i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v7, v0, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v8, v0, v11 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -1113,8 +1159,9 @@ define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: .LBB7_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -1122,10 +1169,11 @@ define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v18i16_to_v9i32_scalar: @@ -1281,16 +1329,56 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v9i32_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v5 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v18, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1300,101 +1388,84 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: @@ -1486,91 +1557,127 @@ define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v9i32_to_v18f16_scalar: @@ -1685,58 +1792,81 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB10_4 ; SI-NEXT: .LBB10_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -1746,18 +1876,22 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1770,10 +1904,10 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -1782,25 +1916,25 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1808,24 +1942,24 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1936,26 +2070,52 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-LABEL: bitcast_v18f16_to_v9i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 @@ -2212,65 +2372,79 @@ define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v9f32_to_v18i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, v0, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18i16: @@ -2367,24 +2541,24 @@ define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s22, 1.0 -; SI-NEXT: v_lshr_b64 v[13:14], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -2398,33 +2572,52 @@ define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v21, s20 -; SI-NEXT: v_mov_b32_e32 v22, s21 -; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v3, s25 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_mov_b32_e32 v15, s28 -; SI-NEXT: v_mov_b32_e32 v17, s8 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: .LBB13_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v25 -; SI-NEXT: v_mov_b32_e32 v2, v26 -; SI-NEXT: v_mov_b32_e32 v4, v23 -; SI-NEXT: v_mov_b32_e32 v6, v24 -; SI-NEXT: v_mov_b32_e32 v8, v21 -; SI-NEXT: v_mov_b32_e32 v10, v22 -; SI-NEXT: v_mov_b32_e32 v12, v19 -; SI-NEXT: v_mov_b32_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18i16_scalar: @@ -2562,21 +2755,34 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v8 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v21, v4 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2588,54 +2794,54 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v19 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2645,15 +2851,15 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v19, v7 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -2772,82 +2978,83 @@ define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v18i16_to_v9f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v7, v0, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v8, v0, v11 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -2855,8 +3062,9 @@ define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -2864,10 +3072,11 @@ define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v18i16_to_v9f32_scalar: @@ -3023,16 +3232,56 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v9f32_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v5 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v18, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3042,101 +3291,84 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: @@ -3223,91 +3455,127 @@ define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e64 v17, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v9f32_to_v18f16_scalar: @@ -3445,25 +3713,52 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3475,28 +3770,24 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v2, v26, v2 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -3506,18 +3797,22 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3530,10 +3825,10 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3542,25 +3837,25 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3568,24 +3863,24 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3696,26 +3991,52 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v9f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 @@ -3972,25 +4293,56 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v17 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v33, v15 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v31, v13 -; SI-NEXT: v_mov_b32_e32 v30, v12 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v5 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v21, v3 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4000,52 +4352,6 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 @@ -4055,46 +4361,84 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: @@ -4202,90 +4546,130 @@ define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i ; SI-LABEL: bitcast_v18i16_to_v18f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v20, v1 -; SI-NEXT: v_mov_b32_e32 v19, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 ; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB21_2 ; @@ -4452,108 +4836,162 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v18i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 ; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_alignbit_b32 v18, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v17, v14, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v18i16: @@ -4663,114 +5101,160 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v17, v8, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v19, v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v22, v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v8, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v23, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 6fae7fdbbf9bb..20a8e6dc2727e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -525,67 +525,87 @@ define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v10i32_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v20, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB4_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB4_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB4_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v20i16: @@ -714,26 +734,46 @@ define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s29, s19, 16 ; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s29 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -865,22 +905,37 @@ define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, v8 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -892,59 +947,59 @@ define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_or_b32_e32 v7, v7, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -955,17 +1010,17 @@ define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v26, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v23, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -1090,90 +1145,91 @@ define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v20i16_to_v10i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v10, v4 -; SI-NEXT: v_mov_b32_e32 v11, v2 -; SI-NEXT: v_mov_b32_e32 v12, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s46, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v9, v0, v13 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -1181,8 +1237,10 @@ define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: .LBB7_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -1190,10 +1248,12 @@ define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v20i16_to_v10i32_scalar: @@ -1358,17 +1418,61 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v10i32_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1379,110 +1483,92 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: .LBB8_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v20f16: @@ -1579,100 +1665,140 @@ define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v10i32_to_v20f16_scalar: @@ -1792,27 +1918,57 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1824,30 +1980,26 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v27, v3 -; SI-NEXT: v_or_b32_e32 v4, v25, v4 -; SI-NEXT: v_or_b32_e32 v5, v23, v5 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -1858,19 +2010,23 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1879,25 +2035,25 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1905,11 +2061,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1917,11 +2073,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -1929,24 +2085,24 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2062,28 +2218,57 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v10i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 @@ -7364,76 +7549,96 @@ define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v10f32_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v20, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB24_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB24_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v10, v20 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v10f32_to_v20i16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -7526,26 +7731,26 @@ define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, ; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB25_4 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v29, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s24, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[26:27], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 ; SI-NEXT: s_branch .LBB25_5 ; SI-NEXT: .LBB25_3: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -7560,37 +7765,57 @@ define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB25_2 ; SI-NEXT: .LBB25_4: -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v29, s17 -; SI-NEXT: v_mov_b32_e32 v26, s18 -; SI-NEXT: v_mov_b32_e32 v27, s19 -; SI-NEXT: v_mov_b32_e32 v24, s20 -; SI-NEXT: v_mov_b32_e32 v25, s21 -; SI-NEXT: v_mov_b32_e32 v22, s22 -; SI-NEXT: v_mov_b32_e32 v23, s23 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_mov_b32_e32 v21, s25 -; SI-NEXT: v_mov_b32_e32 v3, s26 -; SI-NEXT: v_mov_b32_e32 v7, s27 -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_mov_b32_e32 v19, s40 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 ; SI-NEXT: .LBB25_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v28 -; SI-NEXT: v_mov_b32_e32 v2, v29 -; SI-NEXT: v_mov_b32_e32 v4, v26 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: v_mov_b32_e32 v8, v24 -; SI-NEXT: v_mov_b32_e32 v10, v25 -; SI-NEXT: v_mov_b32_e32 v12, v22 -; SI-NEXT: v_mov_b32_e32 v14, v23 -; SI-NEXT: v_mov_b32_e32 v16, v20 -; SI-NEXT: v_mov_b32_e32 v18, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v20i16_scalar: @@ -7731,22 +7956,37 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, v8 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7758,59 +7998,59 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB26_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_or_b32_e32 v7, v7, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: .LBB26_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -7821,17 +8061,17 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v26, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v23, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -7956,90 +8196,91 @@ define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, ; SI-LABEL: bitcast_v20i16_to_v10f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v10, v4 -; SI-NEXT: v_mov_b32_e32 v11, v2 -; SI-NEXT: v_mov_b32_e32 v12, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s46, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v9, v0, v13 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -8047,8 +8288,10 @@ define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -8056,10 +8299,12 @@ define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v20i16_to_v10f32_scalar: @@ -8224,17 +8469,61 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v10f32_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -8245,110 +8534,92 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_4 -; SI-NEXT: .LBB28_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB28_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_2 -; SI-NEXT: .LBB28_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v20f16: @@ -8439,100 +8710,140 @@ define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_add_f32_e64 v6, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v10f32_to_v20f16_scalar: @@ -8673,27 +8984,57 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8705,30 +9046,26 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v2, v29, v2 -; SI-NEXT: v_or_b32_e32 v3, v27, v3 -; SI-NEXT: v_or_b32_e32 v4, v25, v4 -; SI-NEXT: v_or_b32_e32 v5, v23, v5 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -8739,19 +9076,23 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8760,25 +9101,25 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -8786,11 +9127,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -8798,11 +9139,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -8810,24 +9151,24 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8943,28 +9284,57 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v10f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 @@ -14310,87 +14680,71 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v19 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v37, v17 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v35, v15 -; SI-NEXT: v_mov_b32_e32 v34, v14 -; SI-NEXT: v_mov_b32_e32 v33, v13 -; SI-NEXT: v_mov_b32_e32 v32, v12 -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v9 -; SI-NEXT: v_mov_b32_e32 v28, v8 -; SI-NEXT: v_mov_b32_e32 v27, v7 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v23, v3 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v48, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -14401,50 +14755,92 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i16_to_v20f16: @@ -14558,100 +14954,143 @@ define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i ; SI-LABEL: bitcast_v20i16_to_v20f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v22, v5 -; SI-NEXT: v_mov_b32_e32 v21, v4 -; SI-NEXT: v_mov_b32_e32 v20, v3 -; SI-NEXT: v_mov_b32_e32 v25, v2 -; SI-NEXT: v_mov_b32_e32 v24, v1 -; SI-NEXT: v_mov_b32_e32 v23, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB45_2 ; @@ -14826,119 +15265,179 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_alignbit_b32 v19, v1, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 +; SI-NEXT: v_alignbit_b32 v17, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v7, v16, 16 +; SI-NEXT: v_alignbit_b32 v14, v9, v14, 16 ; SI-NEXT: .LBB46_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v20i16: @@ -15053,127 +15552,177 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v4 -; SI-NEXT: v_mov_b32_e32 v10, v3 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v27, v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshr_b64 v[24:25], v[9:10], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[17:18], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v29, v11, v2 +; SI-NEXT: v_or_b32_e32 v28, v10, v4 +; SI-NEXT: v_or_b32_e32 v26, v12, v6 +; SI-NEXT: v_or_b32_e32 v24, v13, v8 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v21 -; SI-NEXT: v_mov_b32_e32 v9, v24 -; SI-NEXT: v_mov_b32_e32 v13, v25 -; SI-NEXT: v_mov_b32_e32 v17, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: s_branch .LBB47_2 @@ -15358,375 +15907,356 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v40i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v24 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v12 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v30, v1, v57 +; SI-NEXT: v_or_b32_e32 v30, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v31, v1, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v31, v1, v56 +; SI-NEXT: v_or_b32_e32 v22, v1, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v21, v1, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v23, v1, v59 +; SI-NEXT: v_or_b32_e32 v20, v1, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v18, v1, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v22, v1, v58 +; SI-NEXT: v_or_b32_e32 v17, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v16, v1, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v21, v1, v61 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v18, v1, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v14, v1, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v10, v1, v62 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v6, v1, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v2, v1, v24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 -; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 -; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 -; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 -; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 -; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 -; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 -; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 -; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_and_b32_e32 v45, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v55, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v51, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_u32 v46, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v43, v12, 8, 8 -; SI-NEXT: v_bfe_u32 v40, v16, 8, 8 -; SI-NEXT: v_bfe_u32 v53, v20, 8, 8 +; SI-NEXT: v_or_b32_e32 v14, v1, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v1, v59 +; SI-NEXT: v_alignbit_b32 v35, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v38, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v50, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v34, v21, v22, 24 +; SI-NEXT: v_alignbit_b32 v36, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v18, v20, 24 +; SI-NEXT: v_alignbit_b32 v32, v18, v20, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v16, v17, 24 +; SI-NEXT: v_alignbit_b32 v27, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v17, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v14, 24 +; SI-NEXT: v_alignbit_b32 v25, v11, v14, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v14, 8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; SI-NEXT: v_bfe_u32 v42, v24, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v19, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v49, v12, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: .LBB48_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v4, v63, v4 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v60, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_or_b32_e32 v7, v57, v7 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v3, v45, v3 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 -; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 -; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 -; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 -; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 -; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 -; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 -; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 -; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 -; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_alignbit_b32 v35, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v38, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v50, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v34, v21, v22, 24 +; SI-NEXT: v_alignbit_b32 v36, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v18, v20, 24 +; SI-NEXT: v_alignbit_b32 v32, v18, v20, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v16, v17, 24 +; SI-NEXT: v_alignbit_b32 v27, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v17, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v14, 24 +; SI-NEXT: v_alignbit_b32 v25, v11, v14, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v14, 8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v54 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16522,132 +17052,125 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s30, 0 -; SI-NEXT: v_writelane_b32 v8, s31, 1 -; SI-NEXT: v_writelane_b32 v8, s34, 2 -; SI-NEXT: v_writelane_b32 v8, s35, 3 -; SI-NEXT: v_writelane_b32 v8, s36, 4 -; SI-NEXT: v_writelane_b32 v8, s37, 5 -; SI-NEXT: v_writelane_b32 v8, s38, 6 -; SI-NEXT: v_writelane_b32 v8, s39, 7 -; SI-NEXT: v_writelane_b32 v8, s48, 8 -; SI-NEXT: v_writelane_b32 v8, s49, 9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_writelane_b32 v8, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s39, v6 -; SI-NEXT: v_readfirstlane_b32 s48, v5 -; SI-NEXT: v_readfirstlane_b32 s49, v4 -; SI-NEXT: v_readfirstlane_b32 s50, v3 -; SI-NEXT: v_readfirstlane_b32 s35, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s38, v1 +; SI-NEXT: v_writelane_b32 v3, s30, 0 +; SI-NEXT: v_writelane_b32 v3, s31, 1 +; SI-NEXT: v_writelane_b32 v3, s34, 2 +; SI-NEXT: v_writelane_b32 v3, s35, 3 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s24, 16 +; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s22, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s20, 16 +; SI-NEXT: s_lshr_b32 s93, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s18, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s95, 16 ; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 ; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 ; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 ; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 ; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 ; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s38, 0xffff -; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s48, 0xffff -; SI-NEXT: s_lshl_b32 s15, s39, 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s90, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 ; SI-NEXT: s_or_b32 s5, s5, s15 -; SI-NEXT: s_lshr_b32 s34, s13, 8 -; SI-NEXT: s_lshr_b32 s95, s11, 8 -; SI-NEXT: s_lshr_b32 s59, s9, 8 -; SI-NEXT: s_lshr_b32 s45, s7, 8 -; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: s_and_b32 s36, s19, 0xffff -; SI-NEXT: s_and_b32 s30, s23, 0xffff -; SI-NEXT: s_and_b32 s61, s27, 0xffff -; SI-NEXT: s_and_b32 s47, s35, 0xffff -; SI-NEXT: s_and_b32 s41, s39, 0xffff -; SI-NEXT: s_bfe_u32 s37, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s31, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s94, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s57, s35, 0x80008 -; SI-NEXT: s_bfe_u32 s43, s39, 0x80008 -; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s59, s13, 8 +; SI-NEXT: s_lshr_b32 s47, s11, 8 +; SI-NEXT: s_lshr_b32 s43, s9, 8 +; SI-NEXT: s_lshr_b32 s29, s7, 8 +; SI-NEXT: s_lshr_b32 s27, s5, 8 +; SI-NEXT: s_bfe_u32 s61, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s93, 0x80008 +; SI-NEXT: s_bfe_u32 s45, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s91, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s90, 0x80008 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 ; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: s_add_i32 s50, s50, 3 -; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s49, 16 -; SI-NEXT: s_add_i32 s48, s48, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s48, 0xffff -; SI-NEXT: s_lshl_b32 s6, s39, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s90, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s28, 0xffff -; SI-NEXT: s_lshl_b32 s7, s29, 16 -; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s38, 0xffff -; SI-NEXT: s_lshl_b32 s8, s35, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s91, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s92, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s20, 0xffff -; SI-NEXT: s_lshl_b32 s11, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s30, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s22, 0xffff -; SI-NEXT: s_lshl_b32 s12, s23, 16 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s93, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: s_and_b32 s12, s16, 0xffff -; SI-NEXT: s_lshl_b32 s13, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s13, s95, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s13, s18, 0xffff -; SI-NEXT: s_lshl_b32 s14, s19, 16 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s14, s94, 16 ; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 @@ -16656,43 +17179,41 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 ; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s37, s13, 24 -; SI-NEXT: s_lshr_b32 s36, s13, 16 -; SI-NEXT: s_lshr_b32 s34, s13, 8 -; SI-NEXT: s_lshr_b32 s31, s11, 24 -; SI-NEXT: s_lshr_b32 s30, s11, 16 -; SI-NEXT: s_lshr_b32 s95, s11, 8 -; SI-NEXT: s_lshr_b32 s94, s9, 24 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: s_lshr_b32 s59, s9, 8 -; SI-NEXT: s_lshr_b32 s57, s7, 24 -; SI-NEXT: s_lshr_b32 s47, s7, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 8 -; SI-NEXT: s_lshr_b32 s43, s5, 24 -; SI-NEXT: s_lshr_b32 s41, s5, 16 -; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s61, s13, 24 +; SI-NEXT: s_lshr_b32 s94, s13, 16 +; SI-NEXT: s_lshr_b32 s59, s13, 8 +; SI-NEXT: s_lshr_b32 s57, s11, 24 +; SI-NEXT: s_lshr_b32 s93, s11, 16 +; SI-NEXT: s_lshr_b32 s47, s11, 8 +; SI-NEXT: s_lshr_b32 s45, s9, 24 +; SI-NEXT: s_lshr_b32 s92, s9, 16 +; SI-NEXT: s_lshr_b32 s43, s9, 8 +; SI-NEXT: s_lshr_b32 s41, s7, 24 +; SI-NEXT: s_lshr_b32 s91, s7, 16 +; SI-NEXT: s_lshr_b32 s29, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 24 +; SI-NEXT: s_lshr_b32 s90, s5, 16 +; SI-NEXT: s_lshr_b32 s27, s5, 8 ; SI-NEXT: .LBB49_3: ; %end ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_lshl_b32 s16, s44, 8 ; SI-NEXT: s_or_b32 s12, s12, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_and_b32 s16, s26, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -16700,21 +17221,21 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s34, 8 +; SI-NEXT: s_lshl_b32 s13, s59, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_and_b32 s13, s94, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_lshl_b32 s14, s37, 24 +; SI-NEXT: s_lshl_b32 s14, s61, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_lshl_b32 s12, s58, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s46, 0xff +; SI-NEXT: s_and_b32 s12, s42, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s42, 24 +; SI-NEXT: s_lshl_b32 s13, s28, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -16725,11 +17246,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s95, 8 +; SI-NEXT: s_lshl_b32 s11, s47, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s30, 0xff +; SI-NEXT: s_and_b32 s11, s93, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: s_lshl_b32 s12, s57, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 @@ -16738,11 +17259,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s88, 8 +; SI-NEXT: s_lshl_b32 s10, s60, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s58, 0xff +; SI-NEXT: s_and_b32 s10, s56, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s44, 24 +; SI-NEXT: s_lshl_b32 s11, s40, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 @@ -16751,11 +17272,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s59, 8 +; SI-NEXT: s_lshl_b32 s9, s43, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s61, 0xff +; SI-NEXT: s_and_b32 s9, s92, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s94, 24 +; SI-NEXT: s_lshl_b32 s10, s45, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 @@ -16764,11 +17285,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s74, 8 +; SI-NEXT: s_lshl_b32 s8, s76, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s90, 0xff +; SI-NEXT: s_and_b32 s8, s72, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s78, 24 +; SI-NEXT: s_lshl_b32 s9, s46, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 @@ -16777,11 +17298,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s45, 8 +; SI-NEXT: s_lshl_b32 s7, s29, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s47, 0xff +; SI-NEXT: s_and_b32 s7, s91, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s57, 24 +; SI-NEXT: s_lshl_b32 s8, s41, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 @@ -16790,9 +17311,9 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s76, 8 +; SI-NEXT: s_lshl_b32 s6, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s72, 0xff +; SI-NEXT: s_and_b32 s6, s74, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s62, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -16803,11 +17324,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s41, 0xff +; SI-NEXT: s_and_b32 s5, s90, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s15, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 @@ -16816,57 +17337,45 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s50, v8, 10 -; SI-NEXT: v_readlane_b32 s49, v8, 9 -; SI-NEXT: v_readlane_b32 s48, v8, 8 -; SI-NEXT: v_readlane_b32 s39, v8, 7 -; SI-NEXT: v_readlane_b32 s38, v8, 6 -; SI-NEXT: v_readlane_b32 s37, v8, 5 -; SI-NEXT: v_readlane_b32 s36, v8, 4 -; SI-NEXT: v_readlane_b32 s35, v8, 3 -; SI-NEXT: v_readlane_b32 s34, v8, 2 -; SI-NEXT: v_readlane_b32 s31, v8, 1 -; SI-NEXT: v_readlane_b32 s30, v8, 0 +; SI-NEXT: v_readlane_b32 s35, v3, 3 +; SI-NEXT: v_readlane_b32 s34, v3, 2 +; SI-NEXT: v_readlane_b32 s31, v3, 1 +; SI-NEXT: v_readlane_b32 s30, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: s_branch .LBB49_2 ; @@ -17623,184 +18132,179 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v31, v14 -; SI-NEXT: v_mov_b32_e32 v33, v12 -; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v19 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v29 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v35 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v14 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v16 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v55 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v40 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v41 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v35, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v2, v49, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v6, v50, v5 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v32, v4, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v25, v0, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v29, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: v_or_b32_e32 v4, v52, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v11, v53, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v10, v7, v11 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v39, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v21, v0, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v12, v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v29 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v15, v47, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v18, v56, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_or_b32_e32 v11, v52, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v14, v12, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v34, v0, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v14, v47, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v20, v59, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v16, v16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v19, v56, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v13, v46, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_or_b32_e32 v18, v16, v19 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v12, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v1, v35, v2, 16 -; SI-NEXT: v_alignbit_b32 v5, v32, v4, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v7, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v17, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v20, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 @@ -17808,8 +18312,8 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: .LBB50_2: ; %Flow @@ -17817,7 +18321,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 @@ -17828,10 +18332,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -17840,7 +18344,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v56, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 @@ -17851,7 +18355,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v46, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 @@ -17862,52 +18366,52 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v38 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_or_b32_e32 v0, v23, v0 @@ -17915,32 +18419,60 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 -; SI-NEXT: v_alignbit_b32 v1, v35, v25, 16 -; SI-NEXT: v_alignbit_b32 v5, v32, v21, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v17, v1, v29, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload @@ -17953,11 +18485,8 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v25 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v21 -; SI-NEXT: v_mov_b32_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18912,282 +19441,365 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v40i8_to_v20i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v27, s30, 0 +; SI-NEXT: v_writelane_b32 v27, s31, 1 +; SI-NEXT: v_writelane_b32 v27, s34, 2 +; SI-NEXT: v_writelane_b32 v27, s35, 3 +; SI-NEXT: v_writelane_b32 v27, s36, 4 +; SI-NEXT: v_writelane_b32 v27, s37, 5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: v_mov_b32_e32 v31, v18 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: v_writelane_b32 v27, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s91, v25 +; SI-NEXT: v_readfirstlane_b32 s90, v24 +; SI-NEXT: v_readfirstlane_b32 s94, v23 +; SI-NEXT: v_readfirstlane_b32 s95, v22 +; SI-NEXT: v_readfirstlane_b32 s31, v21 +; SI-NEXT: v_readfirstlane_b32 s30, v20 +; SI-NEXT: v_readfirstlane_b32 s34, v19 +; SI-NEXT: v_readfirstlane_b32 s35, v18 +; SI-NEXT: v_readfirstlane_b32 s75, v17 +; SI-NEXT: v_readfirstlane_b32 s74, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v14 +; SI-NEXT: v_readfirstlane_b32 s89, v13 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s92, v11 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_readfirstlane_b32 s61, v9 +; SI-NEXT: v_readfirstlane_b32 s60, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v7 +; SI-NEXT: v_readfirstlane_b32 s63, v6 +; SI-NEXT: v_readfirstlane_b32 s73, v5 +; SI-NEXT: v_readfirstlane_b32 s72, v4 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: v_readfirstlane_b32 s58, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v25 +; SI-NEXT: v_readfirstlane_b32 s59, v0 +; SI-NEXT: v_writelane_b32 v27, s39, 7 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s12, s6, s5 -; SI-NEXT: s_or_b32 s6, s4, s12 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s9, s5, s6 ; SI-NEXT: s_and_b32 s5, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s7, s76, 8 +; SI-NEXT: s_or_b32 s10, s5, s7 +; SI-NEXT: s_and_b32 s5, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s73, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s89, 24 +; SI-NEXT: s_or_b32 s44, s7, s5 +; SI-NEXT: s_and_b32 s5, s35, 0xff +; SI-NEXT: s_lshl_b32 s7, s34, 8 +; SI-NEXT: s_or_b32 s12, s5, s7 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s7, s21, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_and_b32_e32 v10, 0xff, v33 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 -; SI-NEXT: s_or_b32 s13, s5, s7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 -; SI-NEXT: v_or_b32_e32 v13, v35, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 -; SI-NEXT: s_and_b32 s5, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v24 -; SI-NEXT: s_or_b32 s5, s5, s9 -; SI-NEXT: s_and_b32 s9, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v9, v34 -; SI-NEXT: v_or_b32_e32 v17, v37, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v19, v0, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v39, v5, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v10, v10, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: v_or_b32_e32 v18, v18, v23 -; SI-NEXT: v_or_b32_e32 v48, v21, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v31 +; SI-NEXT: s_lshl_b32 s13, s23, 24 +; SI-NEXT: s_or_b32 s56, s13, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s13, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s58, 24 +; SI-NEXT: s_or_b32 s57, s14, s13 +; SI-NEXT: s_and_b32 s13, s63, 0xff +; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s60, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s61, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s15, s14 +; SI-NEXT: s_or_b32 s43, s13, vcc_lo +; SI-NEXT: s_and_b32 s13, s79, 0xff +; SI-NEXT: s_lshl_b32 s14, s78, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s74, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s75, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s15, s14 +; SI-NEXT: s_or_b32 s45, s13, vcc_hi +; SI-NEXT: s_and_b32 s13, s95, 0xff +; SI-NEXT: s_lshl_b32 s14, s94, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s90, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s91, 24 +; SI-NEXT: s_or_b32 s36, s15, s14 +; SI-NEXT: s_and_b32 s14, s30, 0xff ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s12, s10, s9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v25, v25, v38 -; SI-NEXT: s_or_b32 s15, s5, s12 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v1, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v14, v14, v39 -; SI-NEXT: v_or_b32_e32 v18, v18, v48 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v25 -; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 -; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_or_b32_e32 v25, v11, v9 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v27, v15, v13 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v29, v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s31, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s56 +; SI-NEXT: s_or_b32 s7, s7, s57 +; SI-NEXT: s_or_b32 s46, s15, s14 +; SI-NEXT: s_or_b32 s47, s13, s36 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s37, s10, 0xffff +; SI-NEXT: s_and_b32 s38, s11, 0xffff +; SI-NEXT: s_and_b32 s39, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_or_b32 s10, s9, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_or_b32 s8, s37, s42 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_or_b32 s6, s38, s44 +; SI-NEXT: s_mov_b32 s7, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_or_b32 s4, s39, s46 +; SI-NEXT: s_mov_b32 s5, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_lshr_b32 s41, s56, 16 +; SI-NEXT: s_lshr_b32 s43, s57, 16 +; SI-NEXT: s_lshr_b32 s45, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s15, s36, 16 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_add_i32 s30, s30, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s26, 0xff -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 +; SI-NEXT: s_and_b32 s6, s30, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s5, s31, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s28, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_and_b32 s5, s95, 0xff +; SI-NEXT: s_lshl_b32 s6, s94, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_and_b32 s7, s90, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s91, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v10, v35, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: s_and_b32 s6, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s8, s88, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s89, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_add_i32 s79, s79, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 8 +; SI-NEXT: s_add_i32 s74, s74, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s9, s22, 0xff -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_and_b32 s9, s74, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s75, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_add_i32 s77, s77, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: s_and_b32 s8, s77, 0xff +; SI-NEXT: s_lshl_b32 s9, s76, 8 +; SI-NEXT: s_add_i32 s72, s72, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s10, s72, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s73, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s63, 0xff +; SI-NEXT: s_lshl_b32 s10, s62, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s11, s60, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s61, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s24, 0xff +; SI-NEXT: s_lshl_b32 s11, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s26, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s27, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s28, 0xff +; SI-NEXT: s_lshl_b32 s12, s29, 8 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s58, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s18, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s19, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s20, 0xff +; SI-NEXT: s_lshl_b32 s14, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s15, s22, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[29:30], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v30 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s41, s13, 16 +; SI-NEXT: s_lshr_b32 s43, s11, 16 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, v25 -; SI-NEXT: v_mov_b32_e32 v10, v26 -; SI-NEXT: v_mov_b32_e32 v12, v27 -; SI-NEXT: v_mov_b32_e32 v14, v28 -; SI-NEXT: v_mov_b32_e32 v16, v29 -; SI-NEXT: v_mov_b32_e32 v18, v30 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s41, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s43, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s45, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s47, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_readlane_b32 s39, v27, 7 +; SI-NEXT: v_readlane_b32 s38, v27, 6 +; SI-NEXT: v_readlane_b32 s37, v27, 5 +; SI-NEXT: v_readlane_b32 s36, v27, 4 +; SI-NEXT: v_readlane_b32 s35, v27, 3 +; SI-NEXT: v_readlane_b32 s34, v27, 2 +; SI-NEXT: v_readlane_b32 s31, v27, 1 +; SI-NEXT: v_readlane_b32 s30, v27, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v40i8_to_v20i16_scalar: @@ -19996,25 +20608,37 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v27, v12 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_mov_b32_e32 v21, v8 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 ; SI-NEXT: v_mov_b32_e32 v25, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -20031,32 +20655,31 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v29 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -20064,8 +20687,9 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: .LBB52_4: ; %cmp.true @@ -20074,11 +20698,11 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -20089,17 +20713,17 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v33, v3 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v20, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -20224,110 +20848,139 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i ; SI-LABEL: bitcast_v20i16_to_v5f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v17, v2 -; SI-NEXT: v_mov_b32_e32 v18, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v10, s36, 0 +; SI-NEXT: v_writelane_b32 v10, s37, 1 +; SI-NEXT: v_writelane_b32 v10, s38, 2 +; SI-NEXT: v_writelane_b32 v10, s39, 3 +; SI-NEXT: v_writelane_b32 v10, s48, 4 +; SI-NEXT: v_writelane_b32 v10, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v10, s50, 6 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: v_writelane_b32 v10, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_readlane_b32 s51, v10, 7 +; SI-NEXT: v_readlane_b32 s50, v10, 6 +; SI-NEXT: v_readlane_b32 s49, v10, 5 +; SI-NEXT: v_readlane_b32 s48, v10, 4 +; SI-NEXT: v_readlane_b32 s39, v10, 3 +; SI-NEXT: v_readlane_b32 s38, v10, 2 +; SI-NEXT: v_readlane_b32 s37, v10, 1 +; SI-NEXT: v_readlane_b32 s36, v10, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v20i16_to_v5f64_scalar: @@ -20501,70 +21154,82 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v5f64_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v9 -; SI-NEXT: v_mov_b32_e32 v27, v8 -; SI-NEXT: v_mov_b32_e32 v26, v7 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_mov_b32_e32 v24, v5 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v21 -; SI-NEXT: v_mov_b32_e32 v6, v22 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v10, v24 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v26 -; SI-NEXT: v_mov_b32_e32 v16, v27 -; SI-NEXT: v_mov_b32_e32 v18, v28 -; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16: @@ -20657,21 +21322,21 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[22:23], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[24:25], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[26:27], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[26:27], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 ; SI-NEXT: s_branch .LBB55_5 ; SI-NEXT: .LBB55_3: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -20686,37 +21351,57 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB55_2 ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v21, s25 -; SI-NEXT: v_mov_b32_e32 v23, s23 -; SI-NEXT: v_mov_b32_e32 v25, s21 -; SI-NEXT: v_mov_b32_e32 v27, s19 -; SI-NEXT: v_mov_b32_e32 v29, s17 -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v26, s18 -; SI-NEXT: v_mov_b32_e32 v24, s20 -; SI-NEXT: v_mov_b32_e32 v22, s22 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_mov_b32_e32 v19, s40 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v7, s27 -; SI-NEXT: v_mov_b32_e32 v3, s26 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 ; SI-NEXT: .LBB55_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v28 -; SI-NEXT: v_mov_b32_e32 v2, v29 -; SI-NEXT: v_mov_b32_e32 v4, v26 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: v_mov_b32_e32 v8, v24 -; SI-NEXT: v_mov_b32_e32 v10, v25 -; SI-NEXT: v_mov_b32_e32 v12, v22 -; SI-NEXT: v_mov_b32_e32 v14, v23 -; SI-NEXT: v_mov_b32_e32 v16, v20 -; SI-NEXT: v_mov_b32_e32 v18, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16_scalar: @@ -20842,25 +21527,37 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v27, v12 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_mov_b32_e32 v21, v8 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 ; SI-NEXT: v_mov_b32_e32 v25, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -20877,32 +21574,31 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v29 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -20910,8 +21606,9 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: .LBB56_4: ; %cmp.true @@ -20920,11 +21617,11 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -20935,17 +21632,17 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v33, v3 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v20, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -21070,110 +21767,139 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v20i16_to_v5i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_mov_b32_e32 v17, v2 -; SI-NEXT: v_mov_b32_e32 v18, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v10, s36, 0 +; SI-NEXT: v_writelane_b32 v10, s37, 1 +; SI-NEXT: v_writelane_b32 v10, s38, 2 +; SI-NEXT: v_writelane_b32 v10, s39, 3 +; SI-NEXT: v_writelane_b32 v10, s48, 4 +; SI-NEXT: v_writelane_b32 v10, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v10, s50, 6 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: v_writelane_b32 v10, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 ; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_readlane_b32 s51, v10, 7 +; SI-NEXT: v_readlane_b32 s50, v10, 6 +; SI-NEXT: v_readlane_b32 s49, v10, 5 +; SI-NEXT: v_readlane_b32 s48, v10, 4 +; SI-NEXT: v_readlane_b32 s39, v10, 3 +; SI-NEXT: v_readlane_b32 s38, v10, 2 +; SI-NEXT: v_readlane_b32 s37, v10, 1 +; SI-NEXT: v_readlane_b32 s36, v10, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v20i16_to_v5i64_scalar: @@ -21347,67 +22073,87 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v5i64_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v20, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB58_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20i16: @@ -21539,26 +22285,46 @@ define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s29 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: ; implicit-def: $sgpr12 @@ -21690,6 +22456,36 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v40i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -21698,110 +22494,110 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_or_b32_e32 v24, v50, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_or_b32_e32 v19, v49, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; SI-NEXT: v_or_b32_e32 v12, v53, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_or_b32_e32 v11, v52, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_or_b32_e32 v9, v40, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v10, v55, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_or_b32_e32 v7, v43, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_or_b32_e32 v8, v42, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_or_b32_e32 v6, v46, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v45, v5 -; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 -; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 -; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 -; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 -; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 -; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 -; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 -; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 -; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v20, v49, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_or_b32_e32 v13, v53, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 +; SI-NEXT: v_or_b32_e32 v10, v40, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_or_b32_e32 v11, v55, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v43, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_or_b32_e32 v9, v42, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_or_b32_e32 v7, v46, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 +; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 +; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 +; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 @@ -21825,107 +22621,107 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v12, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v24, v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_or_b32_e32 v19, v15, v13 -; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 -; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 -; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 -; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 -; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 -; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 -; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 -; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 -; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 -; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v24, v14, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_or_b32_e32 v20, v16, v14 +; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 +; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 +; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 +; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 @@ -21942,80 +22738,80 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v26, v30 ; SI-NEXT: v_or_b32_e32 v24, v24, v26 ; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v39 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v19, v19, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v48 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_or_b32_e32 v19, v19, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v31 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v25 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v19, v22, v19 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v37 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v19 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v36 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -22027,20 +22823,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v14 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -22820,70 +23616,99 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v11 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_readfirstlane_b32 s5, v10 ; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v9 ; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_readfirstlane_b32 s5, v12 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: v_readfirstlane_b32 s5, v17 ; SI-NEXT: s_or_b32 s6, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_readfirstlane_b32 s4, v25 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v23 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v22 +; SI-NEXT: v_readfirstlane_b32 s15, v21 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 ; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 @@ -22904,112 +23729,112 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s19, s9, 8 ; SI-NEXT: s_lshr_b32 s17, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_readfirstlane_b32 s5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 -; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v8 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v7 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v6 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 @@ -23031,11 +23856,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s19, s9, 8 ; SI-NEXT: s_lshr_b32 s17, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 @@ -23046,17 +23871,17 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff ; SI-NEXT: s_lshl_b32 s13, s23, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v5, s12, v5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s26, 8 @@ -23066,22 +23891,22 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s13, s18, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff ; SI-NEXT: s_lshl_b32 s11, s21, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v22 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v5, s10, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s10, s40, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 @@ -23090,22 +23915,22 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s11, s24, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s19, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s8, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s8, s44, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 @@ -23114,18 +23939,21 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s9, s46, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v18 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s60, 8 @@ -23135,22 +23963,19 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s56, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -23164,25 +23989,25 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -23916,19 +24741,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v31, v2 -; SI-NEXT: v_mov_b32_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -23941,119 +24753,132 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v23 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v31 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v35 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v57 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v6, v6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v6, v6, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v6, v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v6, v6, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v38 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v48 -; SI-NEXT: v_or_b32_e32 v4, v4, v49 -; SI-NEXT: v_or_b32_e32 v5, v5, v54 -; SI-NEXT: v_or_b32_e32 v6, v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -24067,22 +24892,22 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 @@ -24095,109 +24920,110 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_add_i32_e32 v35, vcc, 0x300, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload @@ -24212,13 +25038,46 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: v_mov_b32_e32 v8, v33 -; SI-NEXT: v_mov_b32_e32 v10, v21 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v29 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25218,11 +26077,11 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -25234,11 +26093,11 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s13, 0xff ; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s40, 0xff ; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -25250,11 +26109,11 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s14, 0xff ; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_and_b32 s4, s41, 0xff ; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_and_b32 s4, s43, 0xff ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -25266,19 +26125,19 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_and_b32 s4, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_and_b32 s4, s59, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_and_b32 s4, s61, 0xff ; SI-NEXT: s_lshl_b32 s5, s60, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_and_b32 s4, s63, 0xff ; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -25388,45 +26247,85 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB63_2 ; @@ -26236,27 +27135,57 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26268,29 +27197,26 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 ; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -26306,15 +27232,18 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -26327,10 +27256,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26338,10 +27267,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -26349,11 +27278,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -26361,11 +27290,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -26373,10 +27302,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -26506,28 +27435,57 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v5f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 @@ -26821,57 +27779,57 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -26888,47 +27846,77 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v28 -; SI-NEXT: v_mov_b32_e32 v1, v29 -; SI-NEXT: v_mov_b32_e32 v2, v27 -; SI-NEXT: v_mov_b32_e32 v3, v26 -; SI-NEXT: v_mov_b32_e32 v4, v25 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v6, v24 -; SI-NEXT: v_mov_b32_e32 v7, v21 -; SI-NEXT: v_mov_b32_e32 v8, v22 -; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: @@ -27010,95 +27998,135 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v5f64_to_v20f16_scalar: @@ -27224,27 +28252,57 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27256,29 +28314,26 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 ; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -27294,15 +28349,18 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: .LBB68_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -27315,10 +28373,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -27326,10 +28384,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27337,11 +28395,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27349,11 +28407,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27361,10 +28419,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -27494,28 +28552,57 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v5i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 @@ -27806,17 +28893,61 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v5i64_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v9 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v23, v7 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: v_mov_b32_e32 v25, v5 -; SI-NEXT: v_mov_b32_e32 v24, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v29, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -27827,110 +28958,92 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_4 -; SI-NEXT: .LBB70_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB70_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_2 -; SI-NEXT: .LBB70_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v27, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v25, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v23, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: @@ -28030,35 +29143,35 @@ define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -28081,49 +29194,89 @@ define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s21, s25, 0 ; SI-NEXT: s_lshr_b32 s22, s20, 16 ; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 ; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v5i64_to_v20f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 8055ea8be5261..040f0c8b4d299 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -318,21 +318,18 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2i16: @@ -404,8 +401,10 @@ define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -480,9 +479,10 @@ define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -494,14 +494,14 @@ define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 @@ -571,24 +571,25 @@ define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v2i16_to_i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s6, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: .LBB7_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v2i16_to_i32_scalar: @@ -664,32 +665,30 @@ define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { ; SI-LABEL: bitcast_i32_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2f16: @@ -765,6 +764,10 @@ define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -840,9 +843,12 @@ define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -855,14 +861,14 @@ define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -936,9 +942,12 @@ define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v2f16_to_i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 @@ -1035,30 +1044,27 @@ define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) { ; SI-LABEL: bitcast_i32_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2bf16: @@ -1124,20 +1130,22 @@ define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_i32_to_v2bf16_scalar: @@ -1209,8 +1217,10 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1391,9 +1401,11 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI-LABEL: bitcast_v2bf16_to_i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 @@ -2592,21 +2604,18 @@ define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB24_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB24_4 -; SI-NEXT: .LBB24_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB24_2 -; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2i16: @@ -2677,13 +2686,17 @@ define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b ; SI-NEXT: .LBB25_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB25_5 ; SI-NEXT: .LBB25_3: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB25_2 ; SI-NEXT: .LBB25_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB25_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2i16_scalar: @@ -2756,9 +2769,10 @@ define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2770,14 +2784,14 @@ define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB26_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: .LBB26_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 @@ -2847,24 +2861,25 @@ define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b ; SI-LABEL: bitcast_v2i16_to_f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s6, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: .LBB27_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v2i16_to_f32_scalar: @@ -2940,32 +2955,30 @@ define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { ; SI-LABEL: bitcast_f32_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_4 -; SI-NEXT: .LBB28_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB28_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_2 -; SI-NEXT: .LBB28_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2f16: @@ -3041,6 +3054,10 @@ define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg % ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -3117,9 +3134,12 @@ define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3132,14 +3152,14 @@ define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3213,9 +3233,12 @@ define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg % ; SI-LABEL: bitcast_v2f16_to_f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 @@ -3312,30 +3335,27 @@ define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) { ; SI-LABEL: bitcast_f32_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2bf16: @@ -3408,7 +3428,7 @@ define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inre ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB33_5 ; SI-NEXT: .LBB33_3: ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 @@ -3416,6 +3436,11 @@ define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inre ; SI-NEXT: .LBB33_4: ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2bf16_scalar: @@ -3488,8 +3513,10 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3670,9 +3697,11 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI-LABEL: bitcast_v2bf16_to_f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 @@ -4872,33 +4901,31 @@ define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2f16: @@ -4964,18 +4991,23 @@ define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v2i16_to_v2f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -5055,24 +5087,30 @@ define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: .LBB46_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2i16: @@ -5139,22 +5177,28 @@ define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v2i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: s_branch .LBB47_2 @@ -5235,9 +5279,10 @@ define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5246,7 +5291,11 @@ define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2bf16: ; VI: ; %bb.0: @@ -5311,24 +5360,27 @@ define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s7, 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_lshl_b32 s4, s7, 16 ; SI-NEXT: s_lshl_b32 s5, s16, 16 ; SI-NEXT: s_add_i32 s6, s5, 0x30000 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v2i16_to_v2bf16_scalar: @@ -5404,35 +5456,34 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB50_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB50_4 -; SI-NEXT: .LBB50_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB50_2 -; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2i16: @@ -5580,9 +5631,11 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -5596,7 +5649,9 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $vgpr2 @@ -5768,9 +5823,10 @@ define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v1i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5782,14 +5838,14 @@ define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB52_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: .LBB52_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 @@ -5859,24 +5915,25 @@ define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v2i16_to_v1i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s6, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v2i16_to_v1i32_scalar: @@ -5956,21 +6013,18 @@ define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_4 -; SI-NEXT: .LBB54_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB54_2 -; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2i16: @@ -6042,8 +6096,10 @@ define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 in ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -6118,12 +6174,11 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6135,19 +6190,17 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB56_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_bfe_u32 v3, v4, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: .LBB56_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -6302,33 +6355,32 @@ define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inre ; SI-LABEL: bitcast_v2i16_to_v4i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_lshr_b32 s7, s6, 8 -; SI-NEXT: s_and_b32 s8, s17, 0xffff -; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_lshr_b32 s8, s7, 8 +; SI-NEXT: s_bfe_u32 s9, s6, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s5, s6, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_lshr_b32 s7, s6, 8 -; SI-NEXT: s_and_b32 s8, s17, 0xffff -; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s7, s4, s6 +; SI-NEXT: s_lshr_b32 s8, s7, 8 +; SI-NEXT: s_and_b32 s6, s5, 0xffff +; SI-NEXT: s_bfe_u32 s9, s5, 0x80008 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 @@ -6444,50 +6496,47 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v4i8_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB58_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB58_4 -; SI-NEXT: .LBB58_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB58_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 -; SI-NEXT: .LBB58_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2i16: @@ -6716,8 +6765,10 @@ define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inre ; SI-NEXT: s_or_b32 s6, s4, s6 ; SI-NEXT: s_and_b32 s7, s5, 0xffff ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -6860,37 +6911,39 @@ define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2bf16: @@ -6957,28 +7010,35 @@ define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i ; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s17 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2f16_to_v2bf16_scalar: @@ -7057,30 +7117,27 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB62_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB62_4 -; SI-NEXT: .LBB62_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB62_2 -; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -7089,7 +7146,12 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2f16: @@ -7244,30 +7306,36 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v2bf16_to_v2f16_scalar: @@ -7442,9 +7510,12 @@ define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v1i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7457,14 +7528,14 @@ define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7538,9 +7609,12 @@ define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v1i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 @@ -7637,32 +7711,30 @@ define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v1i32_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB66_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB66_4 -; SI-NEXT: .LBB66_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB66_2 -; SI-NEXT: .LBB66_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2f16: @@ -7738,6 +7810,10 @@ define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 i ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -7813,13 +7889,15 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB68_3 @@ -7993,9 +8071,12 @@ define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v2f16_to_v4i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 @@ -8132,46 +8213,45 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v4i8_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB70_4 -; SI-NEXT: .LBB70_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB70_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_2 -; SI-NEXT: .LBB70_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2f16: @@ -8396,6 +8476,10 @@ define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inr ; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -8538,8 +8622,10 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v1i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8720,9 +8806,11 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 @@ -8913,30 +9001,27 @@ define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v1i32_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB74_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB74_4 -; SI-NEXT: .LBB74_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB74_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB74_2 -; SI-NEXT: .LBB74_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2bf16: @@ -9002,20 +9087,22 @@ define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB75_3 ; SI-NEXT: .LBB75_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: .LBB75_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB75_4: -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB75_2 ; ; VI-LABEL: bitcast_v1i32_to_v2bf16_scalar: @@ -9087,8 +9174,10 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -9336,9 +9425,11 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -9582,46 +9673,44 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v4i8_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB78_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB78_4 -; SI-NEXT: .LBB78_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v3, v0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB78_2 -; SI-NEXT: .LBB78_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v2 +; SI-NEXT: .LBB78_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2bf16: @@ -9846,8 +9935,10 @@ define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s6, s5, 0x3000000 ; SI-NEXT: s_add_i32 s7, s4, 0x3000000 ; SI-NEXT: .LBB79_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: ; SI-NEXT: ; implicit-def: $sgpr6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index bd8c305606364..e81978684b8b6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -549,75 +549,94 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v11i32_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: .LBB4_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22i16: @@ -721,10 +740,10 @@ define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s41, s21, 16 ; SI-NEXT: s_lshr_b32 s42, s19, 16 ; SI-NEXT: s_lshr_b32 s43, s17, 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 @@ -743,50 +762,72 @@ define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s27, s25, 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_lshr_b32 s40, s23, 16 ; SI-NEXT: s_lshr_b32 s41, s21, 16 ; SI-NEXT: s_lshr_b32 s42, s19, 16 ; SI-NEXT: s_lshr_b32 s43, s17, 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s43 -; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s13, s6 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s14, s4 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 ; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s42 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s41 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s40 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s10 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v11i32_to_v22i16_scalar: @@ -912,24 +953,40 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: v_mov_b32_e32 v27, v8 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v4 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -941,64 +998,64 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v23 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB6_2 ; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1010,17 +1067,17 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v22, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -1152,98 +1209,99 @@ define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v22i16_to_v11i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v11, v6 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v0, v15 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -1251,8 +1309,11 @@ define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: .LBB7_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -1260,10 +1321,13 @@ define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v22i16_to_v11i32_scalar: @@ -1438,18 +1502,66 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v11i32_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v31, v9 -; SI-NEXT: v_mov_b32_e32 v30, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v5 -; SI-NEXT: v_mov_b32_e32 v26, v4 -; SI-NEXT: v_mov_b32_e32 v25, v3 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1461,119 +1573,100 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: @@ -1673,37 +1766,37 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true @@ -1711,71 +1804,115 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v11i32_to_v22f16_scalar: @@ -1901,29 +2038,62 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1935,33 +2105,28 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -1973,19 +2138,24 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1993,10 +2163,10 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2004,11 +2174,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2016,11 +2186,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2028,11 +2198,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2040,11 +2210,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2052,24 +2222,24 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2190,30 +2360,62 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v11i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s5, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s5, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s5, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: s_lshr_b32 s5, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s5, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: s_lshr_b32 s5, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: s_lshr_b32 s5, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: s_lshr_b32 s5, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 @@ -2222,7 +2424,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 @@ -2232,7 +2434,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v2, v27, v2 ; SI-NEXT: v_or_b32_e32 v3, v25, v3 ; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 ; SI-NEXT: v_or_b32_e32 v6, v19, v6 ; SI-NEXT: v_or_b32_e32 v7, v17, v7 ; SI-NEXT: v_or_b32_e32 v8, v15, v8 @@ -2278,7 +2480,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2286,7 +2488,7 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 @@ -2509,75 +2711,94 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v11f32_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v21, v0, v20, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22i16: @@ -2683,28 +2904,28 @@ define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v32, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v30, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s24, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[31:32], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[20:21], 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -2720,39 +2941,62 @@ define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: v_mov_b32_e32 v7, s40 -; SI-NEXT: v_mov_b32_e32 v11, s41 -; SI-NEXT: v_mov_b32_e32 v15, s42 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v20, s42 ; SI-NEXT: v_mov_b32_e32 v19, s43 -; SI-NEXT: v_mov_b32_e32 v21, s8 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v13, s6 -; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: .LBB13_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v31 -; SI-NEXT: v_mov_b32_e32 v2, v32 -; SI-NEXT: v_mov_b32_e32 v4, v29 -; SI-NEXT: v_mov_b32_e32 v6, v30 -; SI-NEXT: v_mov_b32_e32 v8, v27 -; SI-NEXT: v_mov_b32_e32 v10, v28 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v26 -; SI-NEXT: v_mov_b32_e32 v16, v23 -; SI-NEXT: v_mov_b32_e32 v18, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22i16_scalar: @@ -2896,24 +3140,40 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: v_mov_b32_e32 v27, v8 -; SI-NEXT: v_mov_b32_e32 v26, v6 -; SI-NEXT: v_mov_b32_e32 v25, v4 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2925,64 +3185,64 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v23 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2994,17 +3254,17 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v22, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -3136,98 +3396,99 @@ define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, ; SI-LABEL: bitcast_v22i16_to_v11f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v11, v6 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v0, v15 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -3235,8 +3496,11 @@ define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -3244,10 +3508,13 @@ define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v22i16_to_v11f32_scalar: @@ -3422,18 +3689,66 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v11f32_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v31, v9 -; SI-NEXT: v_mov_b32_e32 v30, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v5 -; SI-NEXT: v_mov_b32_e32 v26, v4 -; SI-NEXT: v_mov_b32_e32 v25, v3 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v22, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3445,119 +3760,100 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: @@ -3651,109 +3947,153 @@ define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s26, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s25, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e64 v19, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s26, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v11f32_to_v22f16_scalar: @@ -3897,29 +4237,62 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3931,33 +4304,28 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_or_b32_e32 v4, v29, v4 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v7, v23, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -3969,19 +4337,24 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3989,10 +4362,10 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4000,11 +4373,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4012,11 +4385,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4024,11 +4397,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4036,11 +4409,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4048,24 +4421,24 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4186,30 +4559,62 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-LABEL: bitcast_v22f16_to_v11f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s5, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s5, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s5, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: s_lshr_b32 s5, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s5, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: s_lshr_b32 s5, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: s_lshr_b32 s5, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: s_lshr_b32 s5, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 @@ -4218,7 +4623,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 @@ -4228,7 +4633,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v27, v2 ; SI-NEXT: v_or_b32_e32 v3, v25, v3 ; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 ; SI-NEXT: v_or_b32_e32 v6, v19, v6 ; SI-NEXT: v_or_b32_e32 v7, v17, v7 ; SI-NEXT: v_or_b32_e32 v8, v15, v8 @@ -4274,7 +4679,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4282,7 +4687,7 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 @@ -4505,29 +4910,66 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v21 -; SI-NEXT: v_mov_b32_e32 v50, v20 -; SI-NEXT: v_mov_b32_e32 v49, v19 -; SI-NEXT: v_mov_b32_e32 v48, v18 -; SI-NEXT: v_mov_b32_e32 v39, v17 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v37, v15 -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_mov_b32_e32 v35, v13 -; SI-NEXT: v_mov_b32_e32 v34, v12 -; SI-NEXT: v_mov_b32_e32 v33, v11 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v31, v9 -; SI-NEXT: v_mov_b32_e32 v30, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v5 -; SI-NEXT: v_mov_b32_e32 v26, v4 -; SI-NEXT: v_mov_b32_e32 v25, v3 -; SI-NEXT: v_mov_b32_e32 v24, v2 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_mov_b32_e32 v52, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4539,60 +4981,6 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 @@ -4604,54 +4992,100 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v50 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v22f16: @@ -4770,110 +5204,156 @@ define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i ; SI-LABEL: bitcast_v22i16_to_v22f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v26, v7 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_mov_b32_e32 v24, v5 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v3 -; SI-NEXT: v_mov_b32_e32 v29, v2 -; SI-NEXT: v_mov_b32_e32 v28, v1 -; SI-NEXT: v_mov_b32_e32 v27, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s28, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v27 -; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 ; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_branch .LBB21_2 ; @@ -5056,129 +5536,195 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_alignbit_b32 v21, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v20, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v7, v18, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v16, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v22i16: @@ -5298,137 +5844,193 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_mov_b32_e32 v10, v4 -; SI-NEXT: v_mov_b32_e32 v13, v3 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v23, v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 +; SI-NEXT: v_or_b32_e32 v29, v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v31, v11, v2 +; SI-NEXT: v_or_b32_e32 v30, v10, v4 +; SI-NEXT: v_or_b32_e32 v28, v12, v6 +; SI-NEXT: v_or_b32_e32 v26, v13, v8 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v24 -; SI-NEXT: v_mov_b32_e32 v9, v25 -; SI-NEXT: v_mov_b32_e32 v13, v26 -; SI-NEXT: v_mov_b32_e32 v17, v27 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 4f6801a4dcdfd..50dfbb9a5d234 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -1652,77 +1652,101 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v12i32_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24i16: @@ -1863,30 +1887,54 @@ define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s44, s19, 16 ; SI-NEXT: s_lshr_b32 s45, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s45 -; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 ; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s44 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s43 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s42 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s6 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s41 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s4 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -2031,25 +2079,43 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v12i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v4 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2061,69 +2127,69 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v34 -; SI-NEXT: v_or_b32_e32 v5, v5, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2136,19 +2202,19 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v32, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v24, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -2285,106 +2351,107 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v24i16_to_v12i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_mov_b32_e32 v12, v8 -; SI-NEXT: v_mov_b32_e32 v13, v6 -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v15, v2 -; SI-NEXT: v_mov_b32_e32 v16, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v10, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: v_or_b32_e32 v11, v0, v17 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -2392,8 +2459,12 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -2401,10 +2472,14 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v24i16_to_v12i32_scalar: @@ -2588,19 +2663,71 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v12i32_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v31, v7 -; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 -; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: v_mov_b32_e32 v24, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -2613,128 +2740,108 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: @@ -2837,118 +2944,166 @@ define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_lshr_b32 s4, s27, 16 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v12i32_to_v24f16_scalar: @@ -3079,31 +3234,67 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3115,36 +3306,30 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v33, v4 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v27, v7 -; SI-NEXT: v_or_b32_e32 v8, v25, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v31, v2 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v23, v6 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -3157,19 +3342,25 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3182,10 +3373,10 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3194,25 +3385,25 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3220,11 +3411,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3232,11 +3423,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3244,24 +3435,24 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3387,56 +3578,91 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v12i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s5, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 ; SI-NEXT: v_or_b32_e32 v6, v22, v6 ; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 ; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_cbranch_execnz .LBB19_3 @@ -3470,20 +3696,20 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 @@ -3511,7 +3737,7 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3519,7 +3745,7 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 @@ -4837,77 +5063,101 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v12f32_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24i16: @@ -5016,30 +5266,30 @@ define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v35, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v34, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v32, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v30, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s26, 1.0 -; SI-NEXT: v_lshr_b64 v[21:22], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[26:27], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 ; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -5056,53 +5306,77 @@ define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v32, s18 -; SI-NEXT: v_mov_b32_e32 v33, s19 -; SI-NEXT: v_mov_b32_e32 v30, s20 -; SI-NEXT: v_mov_b32_e32 v31, s21 -; SI-NEXT: v_mov_b32_e32 v28, s22 -; SI-NEXT: v_mov_b32_e32 v29, s23 -; SI-NEXT: v_mov_b32_e32 v26, s24 -; SI-NEXT: v_mov_b32_e32 v27, s25 -; SI-NEXT: v_mov_b32_e32 v24, s26 -; SI-NEXT: v_mov_b32_e32 v25, s27 -; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v7, s41 -; SI-NEXT: v_mov_b32_e32 v11, s42 -; SI-NEXT: v_mov_b32_e32 v15, s43 -; SI-NEXT: v_mov_b32_e32 v19, s44 -; SI-NEXT: v_mov_b32_e32 v23, s45 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v17, s6 -; SI-NEXT: v_mov_b32_e32 v21, s4 -; SI-NEXT: .LBB29_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v32 -; SI-NEXT: v_mov_b32_e32 v6, v33 -; SI-NEXT: v_mov_b32_e32 v8, v30 -; SI-NEXT: v_mov_b32_e32 v10, v31 -; SI-NEXT: v_mov_b32_e32 v12, v28 -; SI-NEXT: v_mov_b32_e32 v14, v29 -; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: v_mov_b32_e32 v20, v24 -; SI-NEXT: v_mov_b32_e32 v22, v25 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v12f32_to_v24i16_scalar: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s28, 0 -; VI-NEXT: s_cbranch_scc0 .LBB29_3 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB29_4 -; VI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 ; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 ; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 @@ -5239,25 +5513,43 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v12f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v27, v4 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5269,69 +5561,69 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v34 -; SI-NEXT: v_or_b32_e32 v5, v5, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v9, v17 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -5344,19 +5636,19 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v32, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v24, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -5493,106 +5785,107 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-LABEL: bitcast_v24i16_to_v12f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_mov_b32_e32 v12, v8 -; SI-NEXT: v_mov_b32_e32 v13, v6 -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v15, v2 -; SI-NEXT: v_mov_b32_e32 v16, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v10, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: v_or_b32_e32 v11, v0, v17 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5600,8 +5893,12 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: .LBB31_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5609,10 +5906,14 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v24i16_to_v12f32_scalar: @@ -5796,19 +6097,71 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v12f32_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v31, v7 -; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 -; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: v_mov_b32_e32 v24, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -5821,128 +6174,108 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: @@ -6038,118 +6371,166 @@ define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s27, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_add_f32_e64 v23, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s27, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v12f32_to_v24f16_scalar: @@ -6296,31 +6677,67 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6332,36 +6749,30 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v33, v4 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v7, v27, v7 -; SI-NEXT: v_or_b32_e32 v8, v25, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v31, v2 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v23, v6 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -6374,19 +6785,25 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6399,10 +6816,10 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -6411,25 +6828,25 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6437,11 +6854,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -6449,11 +6866,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -6461,24 +6878,24 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6604,56 +7021,91 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v12f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s5, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 ; SI-NEXT: v_or_b32_e32 v6, v22, v6 ; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 ; SI-NEXT: v_or_b32_e32 v10, v14, v10 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_cbranch_execnz .LBB35_3 @@ -6687,20 +7139,20 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 @@ -6728,7 +7180,7 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -6736,7 +7188,7 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 @@ -7479,81 +7931,95 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v6f64_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: v_mov_b32_e32 v31, v8 -; SI-NEXT: v_mov_b32_e32 v30, v7 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: v_mov_b32_e32 v28, v5 -; SI-NEXT: v_mov_b32_e32 v27, v4 -; SI-NEXT: v_mov_b32_e32 v26, v3 -; SI-NEXT: v_mov_b32_e32 v25, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v25 -; SI-NEXT: v_mov_b32_e32 v6, v26 -; SI-NEXT: v_mov_b32_e32 v8, v27 -; SI-NEXT: v_mov_b32_e32 v10, v28 -; SI-NEXT: v_mov_b32_e32 v12, v29 -; SI-NEXT: v_mov_b32_e32 v14, v30 -; SI-NEXT: v_mov_b32_e32 v16, v31 -; SI-NEXT: v_mov_b32_e32 v18, v32 -; SI-NEXT: v_mov_b32_e32 v20, v33 -; SI-NEXT: v_mov_b32_e32 v22, v34 -; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24i16: @@ -7651,24 +8117,24 @@ define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[24:25], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[26:27], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[30:31], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[34:35], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[21:22], v[24:25], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[26:27], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 ; SI-NEXT: s_branch .LBB41_5 ; SI-NEXT: .LBB41_3: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -7685,51 +8151,75 @@ define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB41_2 ; SI-NEXT: .LBB41_4: -; SI-NEXT: v_mov_b32_e32 v25, s27 -; SI-NEXT: v_mov_b32_e32 v27, s25 -; SI-NEXT: v_mov_b32_e32 v29, s23 -; SI-NEXT: v_mov_b32_e32 v31, s21 -; SI-NEXT: v_mov_b32_e32 v33, s19 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v32, s18 -; SI-NEXT: v_mov_b32_e32 v30, s20 -; SI-NEXT: v_mov_b32_e32 v28, s22 -; SI-NEXT: v_mov_b32_e32 v26, s24 -; SI-NEXT: v_mov_b32_e32 v24, s26 -; SI-NEXT: v_mov_b32_e32 v23, s45 -; SI-NEXT: v_mov_b32_e32 v19, s44 -; SI-NEXT: v_mov_b32_e32 v15, s43 -; SI-NEXT: v_mov_b32_e32 v11, s42 -; SI-NEXT: v_mov_b32_e32 v7, s41 -; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v17, s6 -; SI-NEXT: v_mov_b32_e32 v21, s4 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 ; SI-NEXT: .LBB41_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v32 -; SI-NEXT: v_mov_b32_e32 v6, v33 -; SI-NEXT: v_mov_b32_e32 v8, v30 -; SI-NEXT: v_mov_b32_e32 v10, v31 -; SI-NEXT: v_mov_b32_e32 v12, v28 -; SI-NEXT: v_mov_b32_e32 v14, v29 -; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: v_mov_b32_e32 v20, v24 -; SI-NEXT: v_mov_b32_e32 v22, v25 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v6f64_to_v24i16_scalar: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s28, 0 -; VI-NEXT: s_cbranch_scc0 .LBB41_3 -; VI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB41_4 ; VI-NEXT: .LBB41_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 @@ -7850,27 +8340,43 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v6f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v31, v12 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v25, v4 -; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 ; SI-NEXT: v_mov_b32_e32 v27, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7885,41 +8391,39 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v34 -; SI-NEXT: v_or_b32_e32 v8, v8, v33 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 +; SI-NEXT: v_or_b32_e32 v4, v4, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -7927,24 +8431,26 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -7957,19 +8463,19 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v19, v10 -; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v28, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -8106,126 +8612,159 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i ; SI-LABEL: bitcast_v24i16_to_v6f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v19, v2 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s36, 0 +; SI-NEXT: v_writelane_b32 v12, s37, 1 +; SI-NEXT: v_writelane_b32 v12, s38, 2 +; SI-NEXT: v_writelane_b32 v12, s39, 3 +; SI-NEXT: v_writelane_b32 v12, s48, 4 +; SI-NEXT: v_writelane_b32 v12, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v12, s50, 6 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: v_writelane_b32 v12, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v10, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 -; SI-NEXT: v_or_b32_e32 v11, v0, v21 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 ; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_readlane_b32 s51, v12, 7 +; SI-NEXT: v_readlane_b32 s50, v12, 6 +; SI-NEXT: v_readlane_b32 s49, v12, 5 +; SI-NEXT: v_readlane_b32 s48, v12, 4 +; SI-NEXT: v_readlane_b32 s39, v12, 3 +; SI-NEXT: v_readlane_b32 s38, v12, 2 +; SI-NEXT: v_readlane_b32 s37, v12, 1 +; SI-NEXT: v_readlane_b32 s36, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v24i16_to_v6f64_scalar: @@ -8418,67 +8957,67 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -8497,55 +9036,91 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v1, v35 -; SI-NEXT: v_mov_b32_e32 v2, v33 -; SI-NEXT: v_mov_b32_e32 v3, v32 -; SI-NEXT: v_mov_b32_e32 v4, v31 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: v_mov_b32_e32 v6, v30 -; SI-NEXT: v_mov_b32_e32 v7, v27 -; SI-NEXT: v_mov_b32_e32 v8, v28 -; SI-NEXT: v_mov_b32_e32 v9, v24 -; SI-NEXT: v_mov_b32_e32 v10, v26 -; SI-NEXT: v_mov_b32_e32 v11, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: @@ -8630,112 +9205,160 @@ define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v6f64_to_v24f16_scalar: @@ -8864,31 +9487,67 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8900,35 +9559,30 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v31, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 ; SI-NEXT: v_or_b32_e32 v10, v18, v10 ; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -8944,6 +9598,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -8951,10 +9610,10 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8967,10 +9626,10 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8979,25 +9638,25 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9005,11 +9664,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9017,11 +9676,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9172,56 +9831,91 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v6f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s5, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v28, v4 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 ; SI-NEXT: v_or_b32_e32 v7, v24, v7 ; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 ; SI-NEXT: v_or_b32_e32 v10, v18, v10 ; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: s_cbranch_execnz .LBB47_3 @@ -9254,21 +9948,21 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 @@ -9296,7 +9990,7 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9304,7 +9998,7 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 @@ -9522,77 +10216,101 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v6i64_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24i16: @@ -9736,30 +10454,54 @@ define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s45 -; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 ; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s44 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s43 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s42 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s6 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s41 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s4 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -9904,27 +10646,43 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v6i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v31, v12 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v25, v4 -; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 ; SI-NEXT: v_mov_b32_e32 v27, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9939,41 +10697,39 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v34 -; SI-NEXT: v_or_b32_e32 v8, v8, v33 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 +; SI-NEXT: v_or_b32_e32 v4, v4, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -9981,24 +10737,26 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -10011,19 +10769,19 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v19, v10 -; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v28, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -10160,126 +10918,159 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v24i16_to_v6i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v17, v6 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v19, v2 -; SI-NEXT: v_mov_b32_e32 v20, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s36, 0 +; SI-NEXT: v_writelane_b32 v12, s37, 1 +; SI-NEXT: v_writelane_b32 v12, s38, 2 +; SI-NEXT: v_writelane_b32 v12, s39, 3 +; SI-NEXT: v_writelane_b32 v12, s48, 4 +; SI-NEXT: v_writelane_b32 v12, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v12, s50, 6 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: v_writelane_b32 v12, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v10, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 -; SI-NEXT: v_or_b32_e32 v11, v0, v21 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_readlane_b32 s51, v12, 7 +; SI-NEXT: v_readlane_b32 s50, v12, 6 +; SI-NEXT: v_readlane_b32 s49, v12, 5 +; SI-NEXT: v_readlane_b32 s48, v12, 4 +; SI-NEXT: v_readlane_b32 s39, v12, 3 +; SI-NEXT: v_readlane_b32 s38, v12, 2 +; SI-NEXT: v_readlane_b32 s37, v12, 1 +; SI-NEXT: v_readlane_b32 s36, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v24i16_to_v6i64_scalar: @@ -10469,19 +11260,71 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v6i64_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v24, v10 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: v_mov_b32_e32 v28, v6 -; SI-NEXT: v_mov_b32_e32 v31, v5 -; SI-NEXT: v_mov_b32_e32 v30, v4 -; SI-NEXT: v_mov_b32_e32 v33, v3 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_mov_b32_e32 v34, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -10494,128 +11337,108 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB52_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB52_4 -; SI-NEXT: .LBB52_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB52_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 -; SI-NEXT: .LBB52_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v31, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v29, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v26 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v27, vcc -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: @@ -10721,41 +11544,41 @@ define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -10782,57 +11605,105 @@ define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s25, s27, 0 ; SI-NEXT: s_lshr_b32 s26, s24, 16 ; SI-NEXT: s_lshr_b32 s27, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v6i64_to_v24f16_scalar: @@ -10963,71 +11834,102 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_4 -; SI-NEXT: .LBB54_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_or_b32_e32 v6, v31, v6 -; SI-NEXT: v_or_b32_e32 v7, v29, v7 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 ; SI-NEXT: v_or_b32_e32 v10, v18, v10 ; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -11043,6 +11945,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 @@ -11050,10 +11957,10 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -11066,10 +11973,10 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -11078,25 +11985,25 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -11104,11 +12011,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -11116,11 +12023,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -11271,56 +12178,91 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-LABEL: bitcast_v24f16_to_v6i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_lshr_b32 s5, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v28, v4 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 ; SI-NEXT: v_or_b32_e32 v7, v24, v7 ; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 ; SI-NEXT: v_or_b32_e32 v10, v18, v10 ; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: s_cbranch_execnz .LBB55_3 @@ -11353,21 +12295,21 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 @@ -11395,7 +12337,7 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -11403,7 +12345,7 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 @@ -11621,33 +12563,71 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v23 -; SI-NEXT: v_mov_b32_e32 v54, v22 -; SI-NEXT: v_mov_b32_e32 v53, v21 -; SI-NEXT: v_mov_b32_e32 v52, v20 -; SI-NEXT: v_mov_b32_e32 v51, v19 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_mov_b32_e32 v49, v17 -; SI-NEXT: v_mov_b32_e32 v48, v16 -; SI-NEXT: v_mov_b32_e32 v39, v15 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v13 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v31, v7 -; SI-NEXT: v_mov_b32_e32 v30, v6 -; SI-NEXT: v_mov_b32_e32 v29, v5 -; SI-NEXT: v_mov_b32_e32 v28, v4 -; SI-NEXT: v_mov_b32_e32 v27, v3 -; SI-NEXT: v_mov_b32_e32 v26, v2 -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -11660,58 +12640,6 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 @@ -11728,58 +12656,104 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v24f16: @@ -11903,120 +12877,169 @@ define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i ; SI-LABEL: bitcast_v24i16_to_v24f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_mov_b32_e32 v30, v9 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_mov_b32_e32 v27, v6 -; SI-NEXT: v_mov_b32_e32 v26, v5 -; SI-NEXT: v_mov_b32_e32 v25, v4 -; SI-NEXT: v_mov_b32_e32 v24, v3 -; SI-NEXT: v_mov_b32_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v32, v1 -; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s29, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v31 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 ; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_branch .LBB57_2 ; @@ -12207,140 +13230,212 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v22, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v20, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v19, 16 +; SI-NEXT: v_alignbit_b32 v17, v11, v17, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v24i16: @@ -12465,150 +13560,210 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v8 -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_lshr_b32 s5, s23, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s5, s22, 16 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_or_b32_e32 v34, v12, v0 +; SI-NEXT: v_or_b32_e32 v32, v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[21:22], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v35, v13, v4 +; SI-NEXT: v_or_b32_e32 v33, v12, v6 +; SI-NEXT: v_or_b32_e32 v30, v14, v8 +; SI-NEXT: v_or_b32_e32 v28, v15, v10 +; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v27 -; SI-NEXT: v_mov_b32_e32 v5, v25 -; SI-NEXT: v_mov_b32_e32 v9, v28 -; SI-NEXT: v_mov_b32_e32 v13, v29 -; SI-NEXT: v_mov_b32_e32 v17, v30 -; SI-NEXT: v_mov_b32_e32 v21, v31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 7fbc631c10e34..95359d8ae8f72 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -1815,87 +1815,115 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v14i32_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28i16: @@ -2049,34 +2077,62 @@ define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s57, s19, 16 ; SI-NEXT: s_lshr_b32 s58, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s40 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s58 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s57 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s56 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s10 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s47 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s8 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s46 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s6 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s45 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s44 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr40 @@ -2236,29 +2292,49 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2271,50 +2347,47 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -2322,27 +2395,30 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2357,21 +2433,21 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v49, v5 -; SI-NEXT: v_or_b32_e32 v6, v48, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -2520,142 +2596,180 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v28i16_to_v14i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v16, v12 -; SI-NEXT: v_mov_b32_e32 v17, v10 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v19, v6 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s36, 0 +; SI-NEXT: v_writelane_b32 v14, s37, 1 +; SI-NEXT: v_writelane_b32 v14, s38, 2 +; SI-NEXT: v_writelane_b32 v14, s39, 3 +; SI-NEXT: v_writelane_b32 v14, s48, 4 +; SI-NEXT: v_writelane_b32 v14, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v14, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_writelane_b32 v14, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v12, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 -; SI-NEXT: v_or_b32_e32 v13, v0, v23 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s49, s4, 0x30000 ; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_readlane_b32 s51, v14, 7 +; SI-NEXT: v_readlane_b32 s50, v14, 6 +; SI-NEXT: v_readlane_b32 s49, v14, 5 +; SI-NEXT: v_readlane_b32 s48, v14, 4 +; SI-NEXT: v_readlane_b32 s39, v14, 3 +; SI-NEXT: v_readlane_b32 s38, v14, 2 +; SI-NEXT: v_readlane_b32 s37, v14, 1 +; SI-NEXT: v_readlane_b32 s36, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v28i16_to_v14i32_scalar: @@ -2863,21 +2977,81 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v14i32_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: v_mov_b32_e32 v29, v12 -; SI-NEXT: v_mov_b32_e32 v30, v11 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v36, v5 -; SI-NEXT: v_mov_b32_e32 v37, v4 -; SI-NEXT: v_mov_b32_e32 v38, v3 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: v_mov_b32_e32 v48, v1 -; SI-NEXT: v_mov_b32_e32 v49, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -2892,148 +3066,126 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: bitcast_v14i32_to_v28f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3141,47 +3293,47 @@ define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -3212,65 +3364,121 @@ define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s15, s27, 16 ; SI-NEXT: s_lshr_b32 s40, s28, 16 ; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: @@ -3414,79 +3622,116 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 ; SI-NEXT: v_or_b32_e32 v11, v20, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -3502,20 +3747,26 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3523,10 +3774,10 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3534,11 +3785,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3546,11 +3797,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -3558,11 +3809,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -3570,11 +3821,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3582,11 +3833,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3594,7 +3845,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 @@ -3624,12 +3875,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14i32: @@ -3764,64 +4010,106 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v14i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v11, v19, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: s_cbranch_execnz .LBB19_3 @@ -3829,7 +4117,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3837,10 +4125,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3848,11 +4136,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3899,8 +4187,8 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3912,7 +4200,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3920,7 +4208,7 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 @@ -5354,87 +5642,115 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v14f32_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28i16: @@ -5552,34 +5868,34 @@ define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, ; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v48, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v39, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v38, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v36, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v34, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v32, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v30, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s28, 1.0 -; SI-NEXT: v_lshr_b64 v[25:26], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[30:31], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: ; SI-NEXT: ; implicit-def: $sgpr40 @@ -5598,49 +5914,77 @@ define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v48, s16 -; SI-NEXT: v_mov_b32_e32 v49, s17 -; SI-NEXT: v_mov_b32_e32 v38, s18 -; SI-NEXT: v_mov_b32_e32 v39, s19 -; SI-NEXT: v_mov_b32_e32 v36, s20 -; SI-NEXT: v_mov_b32_e32 v37, s21 -; SI-NEXT: v_mov_b32_e32 v34, s22 -; SI-NEXT: v_mov_b32_e32 v35, s23 -; SI-NEXT: v_mov_b32_e32 v32, s24 -; SI-NEXT: v_mov_b32_e32 v33, s25 -; SI-NEXT: v_mov_b32_e32 v30, s26 -; SI-NEXT: v_mov_b32_e32 v31, s27 -; SI-NEXT: v_mov_b32_e32 v28, s28 -; SI-NEXT: v_mov_b32_e32 v29, s29 -; SI-NEXT: v_mov_b32_e32 v3, s44 -; SI-NEXT: v_mov_b32_e32 v7, s45 -; SI-NEXT: v_mov_b32_e32 v11, s46 -; SI-NEXT: v_mov_b32_e32 v15, s47 -; SI-NEXT: v_mov_b32_e32 v19, s56 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v24, s56 ; SI-NEXT: v_mov_b32_e32 v23, s57 -; SI-NEXT: v_mov_b32_e32 v27, s58 -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v21, s6 -; SI-NEXT: v_mov_b32_e32 v17, s8 -; SI-NEXT: v_mov_b32_e32 v13, s10 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: v_mov_b32_e32 v22, s58 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v20, s40 ; SI-NEXT: .LBB29_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v49 -; SI-NEXT: v_mov_b32_e32 v4, v38 -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: v_mov_b32_e32 v8, v36 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v12, v34 -; SI-NEXT: v_mov_b32_e32 v14, v35 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: v_mov_b32_e32 v18, v33 -; SI-NEXT: v_mov_b32_e32 v20, v30 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: v_mov_b32_e32 v24, v28 -; SI-NEXT: v_mov_b32_e32 v26, v29 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28i16_scalar: @@ -5795,29 +6139,49 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5830,50 +6194,47 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -5881,27 +6242,30 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -5916,21 +6280,21 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v49, v5 -; SI-NEXT: v_or_b32_e32 v6, v48, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -6079,142 +6443,180 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, ; SI-LABEL: bitcast_v28i16_to_v14f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v16, v12 -; SI-NEXT: v_mov_b32_e32 v17, v10 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v19, v6 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s36, 0 +; SI-NEXT: v_writelane_b32 v14, s37, 1 +; SI-NEXT: v_writelane_b32 v14, s38, 2 +; SI-NEXT: v_writelane_b32 v14, s39, 3 +; SI-NEXT: v_writelane_b32 v14, s48, 4 +; SI-NEXT: v_writelane_b32 v14, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v14, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_writelane_b32 v14, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v12, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 -; SI-NEXT: v_or_b32_e32 v13, v0, v23 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s49, s4, 0x30000 ; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_readlane_b32 s51, v14, 7 +; SI-NEXT: v_readlane_b32 s50, v14, 6 +; SI-NEXT: v_readlane_b32 s49, v14, 5 +; SI-NEXT: v_readlane_b32 s48, v14, 4 +; SI-NEXT: v_readlane_b32 s39, v14, 3 +; SI-NEXT: v_readlane_b32 s38, v14, 2 +; SI-NEXT: v_readlane_b32 s37, v14, 1 +; SI-NEXT: v_readlane_b32 s36, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v28i16_to_v14f32_scalar: @@ -6422,21 +6824,81 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v14f32_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: v_mov_b32_e32 v29, v12 -; SI-NEXT: v_mov_b32_e32 v30, v11 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v36, v5 -; SI-NEXT: v_mov_b32_e32 v37, v4 -; SI-NEXT: v_mov_b32_e32 v38, v3 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: v_mov_b32_e32 v48, v1 -; SI-NEXT: v_mov_b32_e32 v49, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -6451,146 +6913,124 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16: @@ -6693,136 +7133,192 @@ define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v14f32_to_v28f16_scalar: @@ -6977,79 +7473,116 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB34_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 ; SI-NEXT: v_or_b32_e32 v11, v20, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -7065,20 +7598,26 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB34_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7086,10 +7625,10 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7097,11 +7636,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7109,11 +7648,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -7121,11 +7660,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -7133,11 +7672,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7145,11 +7684,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7157,7 +7696,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 @@ -7187,12 +7726,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14f32: @@ -7327,64 +7861,106 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v14f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v11, v19, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: s_cbranch_execnz .LBB35_3 @@ -7392,7 +7968,7 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7400,10 +7976,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7411,11 +7987,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7462,8 +8038,8 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7475,7 +8051,7 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7483,7 +8059,7 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 @@ -8290,87 +8866,115 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v7i64_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v16, v8 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28i16: @@ -8528,34 +9132,62 @@ define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s40 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s58 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s57 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s56 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s10 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s47 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s8 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s46 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s6 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s45 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s44 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr40 @@ -8715,29 +9347,49 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8750,50 +9402,47 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -8801,27 +9450,30 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -8836,21 +9488,21 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v49, v5 -; SI-NEXT: v_or_b32_e32 v6, v48, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -8999,142 +9651,180 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v28i16_to_v7i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v16, v12 -; SI-NEXT: v_mov_b32_e32 v17, v10 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v19, v6 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s36, 0 +; SI-NEXT: v_writelane_b32 v14, s37, 1 +; SI-NEXT: v_writelane_b32 v14, s38, 2 +; SI-NEXT: v_writelane_b32 v14, s39, 3 +; SI-NEXT: v_writelane_b32 v14, s48, 4 +; SI-NEXT: v_writelane_b32 v14, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v14, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_writelane_b32 v14, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v12, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 -; SI-NEXT: v_or_b32_e32 v13, v0, v23 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s49, s4, 0x30000 ; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_readlane_b32 s51, v14, 7 +; SI-NEXT: v_readlane_b32 s50, v14, 6 +; SI-NEXT: v_readlane_b32 s49, v14, 5 +; SI-NEXT: v_readlane_b32 s48, v14, 4 +; SI-NEXT: v_readlane_b32 s39, v14, 3 +; SI-NEXT: v_readlane_b32 s38, v14, 2 +; SI-NEXT: v_readlane_b32 s37, v14, 1 +; SI-NEXT: v_readlane_b32 s36, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v28i16_to_v7i64_scalar: @@ -9342,21 +10032,81 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v7i64_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v29, v13 -; SI-NEXT: v_mov_b32_e32 v28, v12 -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v33, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v39, v3 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v49, v1 -; SI-NEXT: v_mov_b32_e32 v48, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -9371,146 +10121,124 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v30 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v31, vcc -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v29, vcc -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28f16: @@ -9624,47 +10352,47 @@ define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -9695,65 +10423,121 @@ define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s29, s29, 0 ; SI-NEXT: s_lshr_b32 s40, s28, 16 ; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v7i64_to_v28f16_scalar: @@ -9897,79 +10681,116 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB46_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 ; SI-NEXT: v_or_b32_e32 v11, v20, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -9985,20 +10806,26 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB46_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10006,10 +10833,10 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10017,11 +10844,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10029,11 +10856,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10041,11 +10868,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10053,11 +10880,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -10065,11 +10892,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -10077,7 +10904,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 @@ -10107,12 +10934,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7i64: @@ -10247,64 +11069,106 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-LABEL: bitcast_v28f16_to_v7i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v11, v19, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: s_cbranch_execnz .LBB47_3 @@ -10312,7 +11176,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10320,10 +11184,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10331,11 +11195,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10382,8 +11246,8 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -10395,7 +11259,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10403,7 +11267,7 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 @@ -10637,92 +11501,108 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v7f64_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v38, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v36, v9 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v3 -; SI-NEXT: v_mov_b32_e32 v29, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v29 -; SI-NEXT: v_mov_b32_e32 v6, v30 -; SI-NEXT: v_mov_b32_e32 v8, v31 -; SI-NEXT: v_mov_b32_e32 v10, v32 -; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: v_mov_b32_e32 v14, v34 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v36 -; SI-NEXT: v_mov_b32_e32 v20, v37 -; SI-NEXT: v_mov_b32_e32 v22, v38 -; SI-NEXT: v_mov_b32_e32 v24, v48 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16: @@ -10826,27 +11706,27 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i ; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[28:29], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[30:31], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[32:33], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[34:35], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[36:37], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[38:39], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[25:26], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[30:31], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: ; SI-NEXT: ; implicit-def: $sgpr40 @@ -10865,49 +11745,77 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v29, s29 -; SI-NEXT: v_mov_b32_e32 v31, s27 -; SI-NEXT: v_mov_b32_e32 v33, s25 -; SI-NEXT: v_mov_b32_e32 v35, s23 -; SI-NEXT: v_mov_b32_e32 v37, s21 -; SI-NEXT: v_mov_b32_e32 v39, s19 -; SI-NEXT: v_mov_b32_e32 v49, s17 -; SI-NEXT: v_mov_b32_e32 v48, s16 -; SI-NEXT: v_mov_b32_e32 v38, s18 -; SI-NEXT: v_mov_b32_e32 v36, s20 -; SI-NEXT: v_mov_b32_e32 v34, s22 -; SI-NEXT: v_mov_b32_e32 v32, s24 -; SI-NEXT: v_mov_b32_e32 v30, s26 -; SI-NEXT: v_mov_b32_e32 v28, s28 -; SI-NEXT: v_mov_b32_e32 v27, s58 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v22, s58 ; SI-NEXT: v_mov_b32_e32 v23, s57 -; SI-NEXT: v_mov_b32_e32 v19, s56 -; SI-NEXT: v_mov_b32_e32 v15, s47 -; SI-NEXT: v_mov_b32_e32 v11, s46 -; SI-NEXT: v_mov_b32_e32 v7, s45 -; SI-NEXT: v_mov_b32_e32 v3, s44 -; SI-NEXT: v_mov_b32_e32 v1, s40 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v13, s10 -; SI-NEXT: v_mov_b32_e32 v17, s8 -; SI-NEXT: v_mov_b32_e32 v21, s6 -; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s4 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v49 -; SI-NEXT: v_mov_b32_e32 v4, v38 -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: v_mov_b32_e32 v8, v36 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v12, v34 -; SI-NEXT: v_mov_b32_e32 v14, v35 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: v_mov_b32_e32 v18, v33 -; SI-NEXT: v_mov_b32_e32 v20, v30 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: v_mov_b32_e32 v24, v28 -; SI-NEXT: v_mov_b32_e32 v26, v29 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16_scalar: @@ -11041,29 +11949,49 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 ; SI-NEXT: v_mov_b32_e32 v29, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11076,50 +12004,47 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB50_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_or_b32_e32 v2, v2, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v50 -; SI-NEXT: v_or_b32_e32 v5, v5, v49 -; SI-NEXT: v_or_b32_e32 v6, v6, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v39 -; SI-NEXT: v_or_b32_e32 v8, v8, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -11127,27 +12052,30 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -11162,21 +12090,21 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v49, v5 -; SI-NEXT: v_or_b32_e32 v6, v48, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: v_or_b32_e32 v11, v21, v11 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -11325,142 +12253,180 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i ; SI-LABEL: bitcast_v28i16_to_v7f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v16, v12 -; SI-NEXT: v_mov_b32_e32 v17, v10 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v19, v6 -; SI-NEXT: v_mov_b32_e32 v20, v4 -; SI-NEXT: v_mov_b32_e32 v21, v2 -; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s36, 0 +; SI-NEXT: v_writelane_b32 v14, s37, 1 +; SI-NEXT: v_writelane_b32 v14, s38, 2 +; SI-NEXT: v_writelane_b32 v14, s39, 3 +; SI-NEXT: v_writelane_b32 v14, s48, 4 +; SI-NEXT: v_writelane_b32 v14, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v14, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_writelane_b32 v14, s51, 7 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v12, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 -; SI-NEXT: v_or_b32_e32 v13, v0, v23 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s36, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_add_i32 s37, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s38, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s39, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v24, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s49, s4, 0x30000 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_readlane_b32 s51, v14, 7 +; SI-NEXT: v_readlane_b32 s50, v14, 6 +; SI-NEXT: v_readlane_b32 s49, v14, 5 +; SI-NEXT: v_readlane_b32 s48, v14, 4 +; SI-NEXT: v_readlane_b32 s39, v14, 3 +; SI-NEXT: v_readlane_b32 s38, v14, 2 +; SI-NEXT: v_readlane_b32 s37, v14, 1 +; SI-NEXT: v_readlane_b32 s36, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v28i16_to_v7f64_scalar: @@ -11671,77 +12637,77 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -11762,72 +12728,114 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v1, v49 -; SI-NEXT: v_mov_b32_e32 v2, v39 -; SI-NEXT: v_mov_b32_e32 v3, v38 -; SI-NEXT: v_mov_b32_e32 v4, v37 -; SI-NEXT: v_mov_b32_e32 v5, v35 -; SI-NEXT: v_mov_b32_e32 v6, v36 -; SI-NEXT: v_mov_b32_e32 v7, v33 -; SI-NEXT: v_mov_b32_e32 v8, v34 -; SI-NEXT: v_mov_b32_e32 v9, v30 -; SI-NEXT: v_mov_b32_e32 v10, v32 -; SI-NEXT: v_mov_b32_e32 v11, v28 -; SI-NEXT: v_mov_b32_e32 v12, v31 -; SI-NEXT: v_mov_b32_e32 v13, v29 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v7f64_to_v28f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v28f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -11907,129 +12915,185 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v7f64_to_v28f16_scalar: @@ -12163,79 +13227,116 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB54_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v33, v8 -; SI-NEXT: v_or_b32_e32 v9, v31, v9 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 ; SI-NEXT: v_or_b32_e32 v11, v20, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -12251,20 +13352,26 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB54_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12272,10 +13379,10 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12283,11 +13390,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12295,11 +13402,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -12307,11 +13414,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -12319,11 +13426,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -12331,11 +13438,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -12343,7 +13450,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 @@ -12373,12 +13480,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7f64: @@ -12513,64 +13615,106 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v7f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v4, v34, v4 ; SI-NEXT: v_or_b32_e32 v5, v32, v5 ; SI-NEXT: v_or_b32_e32 v6, v30, v6 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v9, v23, v9 ; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v11, v19, v11 ; SI-NEXT: v_or_b32_e32 v12, v18, v12 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: s_cbranch_execnz .LBB55_3 @@ -12578,7 +13722,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12586,10 +13730,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12597,11 +13741,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -12648,8 +13792,8 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -12661,7 +13805,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -12669,7 +13813,7 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 @@ -12903,46 +14047,90 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v27 -; SI-NEXT: v_mov_b32_e32 v46, v26 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v44, v24 -; SI-NEXT: v_mov_b32_e32 v43, v23 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v55, v19 -; SI-NEXT: v_mov_b32_e32 v54, v18 -; SI-NEXT: v_mov_b32_e32 v53, v17 -; SI-NEXT: v_mov_b32_e32 v52, v16 -; SI-NEXT: v_mov_b32_e32 v51, v15 -; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: v_mov_b32_e32 v37, v9 -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v33, v5 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v31, v3 -; SI-NEXT: v_mov_b32_e32 v30, v2 -; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v47 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -12957,66 +14145,6 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -13035,73 +14163,128 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13236,140 +14419,196 @@ define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i ; SI-LABEL: bitcast_v28i16_to_v28f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v38, v13 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v36, v11 -; SI-NEXT: v_mov_b32_e32 v35, v10 -; SI-NEXT: v_mov_b32_e32 v34, v9 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_mov_b32_e32 v31, v6 -; SI-NEXT: v_mov_b32_e32 v30, v5 -; SI-NEXT: v_mov_b32_e32 v29, v4 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v1 -; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 ; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_branch .LBB57_2 ; @@ -13578,161 +14817,245 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v21, v21, v26 +; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v23 +; SI-NEXT: v_or_b32_e32 v17, v17, v22 +; SI-NEXT: v_or_b32_e32 v15, v15, v20 +; SI-NEXT: v_alignbit_b32 v27, v1, v27, 16 +; SI-NEXT: v_alignbit_b32 v26, v3, v26, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v7, v24, 16 +; SI-NEXT: v_alignbit_b32 v23, v9, v23, 16 +; SI-NEXT: v_alignbit_b32 v22, v11, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v20, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v28i16: @@ -13867,174 +15190,244 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v12 -; SI-NEXT: v_mov_b32_e32 v21, v11 -; SI-NEXT: v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v26, v3 -; SI-NEXT: v_mov_b32_e32 v28, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: s_lshr_b32 s7, s20, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_or_b32_e32 v49, v14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_or_b32_e32 v48, v15, v2 +; SI-NEXT: v_or_b32_e32 v38, v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshr_b64 v[34:35], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshr_b64 v[29:30], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[25:26], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v39, v14, v6 +; SI-NEXT: v_or_b32_e32 v36, v15, v8 +; SI-NEXT: v_or_b32_e32 v34, v16, v10 +; SI-NEXT: v_or_b32_e32 v32, v17, v12 +; SI-NEXT: v_lshr_b64 v[26:27], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v34 -; SI-NEXT: v_mov_b32_e32 v5, v32 -; SI-NEXT: v_mov_b32_e32 v9, v29 -; SI-NEXT: v_mov_b32_e32 v13, v35 -; SI-NEXT: v_mov_b32_e32 v17, v30 -; SI-NEXT: v_mov_b32_e32 v21, v36 -; SI-NEXT: v_mov_b32_e32 v25, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index 08038b90687c0..1bcc09a680b2a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -10,23 +10,20 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v3bf16_to_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB0_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB0_4 -; SI-NEXT: .LBB0_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 @@ -36,9 +33,10 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB0_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_2 -; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB0_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -51,7 +49,13 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB0_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3f16: @@ -247,38 +251,46 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_and_b32 s5, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_v3bf16_to_v3f16_scalar: @@ -501,35 +513,33 @@ define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB2_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB2_4 -; SI-NEXT: .LBB2_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB2_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB2_2 -; SI-NEXT: .LBB2_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: s_cbranch_execz .LBB2_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -537,9 +547,16 @@ define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: .LBB2_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3bf16: @@ -610,35 +627,45 @@ define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i ; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 -; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB3_3 ; SI-NEXT: .LBB3_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB3_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB3_2 ; ; VI-LABEL: bitcast_v3f16_to_v3bf16_scalar: @@ -723,44 +750,45 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v3bf16_to_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB4_4 -; SI-NEXT: .LBB4_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB4_2 -; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; SI-NEXT: .LBB4_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3i16: @@ -952,35 +980,40 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_and_b32 s5, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 16 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v3bf16_to_v3i16_scalar: @@ -1196,34 +1229,34 @@ define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v3i16_to_v3bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB6_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB6_4 -; SI-NEXT: .LBB6_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB6_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB6_2 -; SI-NEXT: .LBB6_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3bf16: @@ -1292,32 +1325,36 @@ define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s9, s17, 16 ; SI-NEXT: s_cbranch_execnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s16, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_lshl_b32 s5, s18, 16 -; SI-NEXT: s_and_b32 s7, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s4, 16 -; SI-NEXT: s_add_i32 s8, s5, 0x30000 +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s4, 16 +; SI-NEXT: s_add_i32 s9, s6, 0x30000 ; SI-NEXT: .LBB7_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v3i16_to_v3bf16_scalar: @@ -1398,29 +1435,36 @@ define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_alignbit_b32 v3, v1, v2, 16 ; SI-NEXT: .LBB8_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3i16: @@ -1491,28 +1535,34 @@ define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v3f16_to_v3i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: ; SI-NEXT: s_branch .LBB9_2 @@ -1599,39 +1649,38 @@ define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v3i16_to_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_4 -; SI-NEXT: .LBB10_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB10_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: .LBB10_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3f16: @@ -1700,26 +1749,32 @@ define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v3i16_to_v3f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: s_add_i32 s5, s17, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_add_i32 s6, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_v3i16_to_v3f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 3e2b488d02f37..0625121f9ea7a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -1942,97 +1942,129 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v16i32_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32i16: @@ -2143,111 +2175,173 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v16i32_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 -; SI-NEXT: v_mov_b32_e32 v28, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: @@ -2404,95 +2498,114 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB14_2 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -2502,30 +2615,32 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB14_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2542,23 +2657,23 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v51, v7 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v48, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -2575,12 +2690,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16i32: @@ -2724,153 +2834,184 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v32i16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v26, v14 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v19, v10 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v21, v6 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 -; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v0, v18 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -3128,23 +3269,91 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v16i32_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v15 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v36, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_mov_b32_e32 v39, v8 -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v5 -; SI-NEXT: v_mov_b32_e32 v51, v4 -; SI-NEXT: v_mov_b32_e32 v52, v3 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v1 -; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3161,164 +3370,140 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32f16: @@ -3464,53 +3649,53 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -3545,73 +3730,137 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s43, s7, 16 ; SI-NEXT: s_lshr_b32 s44, s6, 16 ; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_or_b32_e32 v10, v21, v10 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v16i32_to_v32f16_scalar: @@ -3768,97 +4017,126 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -3877,6 +4155,13 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -3884,15 +4169,14 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3905,10 +4189,10 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3916,10 +4200,10 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3927,11 +4211,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3939,11 +4223,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -3951,11 +4235,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3963,11 +4247,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3975,11 +4259,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -3987,11 +4271,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -3999,35 +4283,27 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4173,82 +4449,128 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_or_b32_e32 v9, v27, v9 +; SI-NEXT: v_or_b32_e32 v10, v26, v10 +; SI-NEXT: v_or_b32_e32 v11, v23, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -4257,25 +4579,25 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4283,11 +4605,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4295,11 +4617,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4310,8 +4632,8 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -4319,11 +4641,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4331,10 +4653,10 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -4343,11 +4665,11 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -4355,10 +4677,10 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -4374,8 +4696,6 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -4573,23 +4893,75 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v16i32_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4606,132 +4978,124 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32bf16: @@ -4876,38 +5240,38 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: v_readfirstlane_b32 s79, v1 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s79, 16 -; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s77, 16 -; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s76, 16 -; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s75, 16 -; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s74, 16 -; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s73, 16 -; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s72, 16 -; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s63, 16 -; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s62, 16 -; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s61, 16 -; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s60, 16 -; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s59, 16 -; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s58, 16 -; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s57, 16 -; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s56, 16 +; SI-NEXT: s_and_b32 s7, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s79, 16 +; SI-NEXT: s_and_b32 s9, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s78, 16 +; SI-NEXT: s_and_b32 s11, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_and_b32 s13, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_and_b32 s15, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s75, 16 +; SI-NEXT: s_and_b32 s17, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_and_b32 s19, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s73, 16 +; SI-NEXT: s_and_b32 s21, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_and_b32 s23, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s63, 16 +; SI-NEXT: s_and_b32 s25, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s27, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s61, 16 +; SI-NEXT: s_and_b32 s29, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s60, 16 +; SI-NEXT: s_and_b32 s41, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s59, 16 +; SI-NEXT: s_and_b32 s43, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_and_b32 s45, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s57, 16 +; SI-NEXT: s_and_b32 s47, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_i32 s56, s56, 3 @@ -4926,105 +5290,137 @@ define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a ; SI-NEXT: s_add_i32 s77, s77, 3 ; SI-NEXT: s_add_i32 s78, s78, 3 ; SI-NEXT: s_add_i32 s79, s79, 3 -; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s79, 16 -; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s77, 16 -; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s76, 16 -; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s75, 16 -; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s74, 16 -; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s73, 16 -; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s72, 16 -; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s63, 16 -; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s62, 16 -; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s61, 16 -; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s60, 16 -; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s59, 16 -; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s58, 16 -; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s57, 16 -; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s56, 16 +; SI-NEXT: s_and_b32 s7, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s79, 16 +; SI-NEXT: s_and_b32 s9, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s78, 16 +; SI-NEXT: s_and_b32 s11, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_and_b32 s13, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_and_b32 s15, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s75, 16 +; SI-NEXT: s_and_b32 s17, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_and_b32 s19, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s73, 16 +; SI-NEXT: s_and_b32 s21, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_and_b32 s23, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s63, 16 +; SI-NEXT: s_and_b32 s25, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s27, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s61, 16 +; SI-NEXT: s_and_b32 s29, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s60, 16 +; SI-NEXT: s_and_b32 s41, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s59, 16 +; SI-NEXT: s_and_b32 s43, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_and_b32 s45, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s57, 16 +; SI-NEXT: s_and_b32 s47, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s56, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s47 -; SI-NEXT: v_mov_b32_e32 v1, s46 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v3, s44 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s40 -; SI-NEXT: v_mov_b32_e32 v8, s29 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s15 -; SI-NEXT: v_mov_b32_e32 v23, s14 -; SI-NEXT: v_mov_b32_e32 v24, s13 -; SI-NEXT: v_mov_b32_e32 v25, s12 -; SI-NEXT: v_mov_b32_e32 v26, s11 -; SI-NEXT: v_mov_b32_e32 v27, s10 -; SI-NEXT: v_mov_b32_e32 v28, s9 -; SI-NEXT: v_mov_b32_e32 v29, s8 -; SI-NEXT: v_mov_b32_e32 v30, s7 -; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v16i32_to_v32bf16_scalar: @@ -5181,132 +5577,154 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB22_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v55, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v53, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v35, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 ; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: .LBB22_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB22_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -5314,62 +5732,62 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -5385,7 +5803,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -5393,23 +5811,12 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: .LBB22_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16i32: @@ -6576,7 +6983,39 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6594,51 +7033,50 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 ; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 ; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 ; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 @@ -6660,16 +7098,16 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 ; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_mov_b32_e32 v20, v16 ; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v20 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 @@ -6678,19 +7116,19 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -6756,7 +7194,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -17040,97 +17478,129 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v16f32_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB36_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB36_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32i16: @@ -17234,110 +17704,142 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_3 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v49, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v50, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v52, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v55, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v54, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB37_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 -; SI-NEXT: v_mov_b32_e32 v28, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB37_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB37_2 ; ; VI-LABEL: bitcast_v16f32_to_v32i16_scalar: @@ -17498,95 +18000,114 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB38_2 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -17596,30 +18117,32 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB38_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB38_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -17636,23 +18159,23 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v51, v7 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v48, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -17669,12 +18192,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; SI-NEXT: .LBB38_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16f32: @@ -17818,153 +18336,184 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; SI-LABEL: bitcast_v32i16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v26, v14 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v19, v10 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v21, v6 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 -; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v0, v18 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB39_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: @@ -18222,23 +18771,91 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v16f32_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v15 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v36, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_mov_b32_e32 v39, v8 -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v5 -; SI-NEXT: v_mov_b32_e32 v51, v4 -; SI-NEXT: v_mov_b32_e32 v52, v3 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v1 -; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -18255,164 +18872,140 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB40_4 -; SI-NEXT: .LBB40_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB40_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 -; SI-NEXT: .LBB40_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: @@ -18528,162 +19121,224 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v38, s25 ; SI-NEXT: v_mov_b32_e32 v36, s26 ; SI-NEXT: v_mov_b32_e32 v35, s27 -; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: v_mov_b32_e32 v33, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v3, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v52 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v37 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v32 -; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_or_b32_e32 v10, v21, v10 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v16f32_to_v32f16_scalar: @@ -18844,97 +19499,126 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -18953,6 +19637,13 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -18960,15 +19651,14 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -18981,10 +19671,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -18992,10 +19682,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19003,11 +19693,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19015,11 +19705,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -19027,11 +19717,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -19039,11 +19729,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19051,11 +19741,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -19063,11 +19753,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -19075,35 +19765,27 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19249,82 +19931,128 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_or_b32_e32 v9, v27, v9 +; SI-NEXT: v_or_b32_e32 v10, v26, v10 +; SI-NEXT: v_or_b32_e32 v11, v23, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19333,25 +20061,25 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -19359,11 +20087,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19371,11 +20099,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -19386,8 +20114,8 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -19395,11 +20123,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19407,10 +20135,10 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -19419,11 +20147,11 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -19431,10 +20159,10 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -19450,8 +20178,6 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -19649,23 +20375,75 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v16f32_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -19682,132 +20460,124 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v55 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32bf16: @@ -19910,144 +20680,256 @@ define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 x float> inreg ; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 -; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v34 -; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: s_and_b32 s22, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s21, 16 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s20, 16 +; SI-NEXT: s_and_b32 s26, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s19, 16 +; SI-NEXT: s_and_b32 s28, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s18, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s17, 16 +; SI-NEXT: s_and_b32 s42, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s16, 16 +; SI-NEXT: s_and_b32 s44, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s15, 16 +; SI-NEXT: s_and_b32 s46, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s14, 16 +; SI-NEXT: s_and_b32 s56, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_and_b32 s58, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s12, 16 +; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_and_b32 s62, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s10, 16 +; SI-NEXT: s_and_b32 s72, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s9, 16 +; SI-NEXT: s_and_b32 s74, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s8, 16 +; SI-NEXT: s_and_b32 s76, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s7, 16 +; SI-NEXT: s_and_b32 s78, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s6, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v33, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_add_f32_e64 v20, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s6, 1.0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v33 -; SI-NEXT: v_mov_b32_e32 v1, v32 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_branch .LBB45_5 +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v30, s79 +; SI-NEXT: v_mov_b32_e32 v31, s78 +; SI-NEXT: v_mov_b32_e32 v28, s77 +; SI-NEXT: v_mov_b32_e32 v29, s76 +; SI-NEXT: v_mov_b32_e32 v26, s75 +; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: v_mov_b32_e32 v24, s73 +; SI-NEXT: v_mov_b32_e32 v25, s72 +; SI-NEXT: v_mov_b32_e32 v22, s63 +; SI-NEXT: v_mov_b32_e32 v23, s62 +; SI-NEXT: v_mov_b32_e32 v20, s61 +; SI-NEXT: v_mov_b32_e32 v21, s60 +; SI-NEXT: v_mov_b32_e32 v18, s59 +; SI-NEXT: v_mov_b32_e32 v19, s58 +; SI-NEXT: v_mov_b32_e32 v16, s57 +; SI-NEXT: v_mov_b32_e32 v17, s56 +; SI-NEXT: v_mov_b32_e32 v14, s47 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v12, s45 +; SI-NEXT: v_mov_b32_e32 v13, s44 +; SI-NEXT: v_mov_b32_e32 v10, s43 +; SI-NEXT: v_mov_b32_e32 v11, s42 +; SI-NEXT: v_mov_b32_e32 v8, s41 +; SI-NEXT: v_mov_b32_e32 v9, s40 +; SI-NEXT: v_mov_b32_e32 v6, s29 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v2, s25 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v0, s23 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[50:51], 16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32bf16_scalar: ; VI: ; %bb.0: @@ -20207,132 +21089,154 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB46_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v55, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v53, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v35, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 ; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB46_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -20340,62 +21244,62 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -20411,7 +21315,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -20419,23 +21323,12 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16f32: @@ -21602,7 +22495,39 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -21620,51 +22545,50 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 ; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 ; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 ; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 @@ -21686,16 +22610,16 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 ; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_mov_b32_e32 v20, v16 ; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v20 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 @@ -21704,19 +22628,19 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -21782,7 +22706,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -31530,97 +32454,129 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v8i64_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v24, v12 -; SI-NEXT: v_mov_b32_e32 v22, v11 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_mov_b32_e32 v18, v9 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: v_mov_b32_e32 v14, v7 -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_mov_b32_e32 v10, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v6, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32i16: @@ -31735,111 +32691,173 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v8i64_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 -; SI-NEXT: v_mov_b32_e32 v28, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: @@ -31996,95 +33014,114 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -32094,30 +33131,32 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -32134,23 +33173,23 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v51, v7 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v48, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -32167,12 +33206,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8i64: @@ -32316,153 +33350,184 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v32i16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v26, v14 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v19, v10 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v21, v6 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 -; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v0, v18 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: @@ -32720,23 +33785,91 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v8i64_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v15 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v35, v13 -; SI-NEXT: v_mov_b32_e32 v34, v12 -; SI-NEXT: v_mov_b32_e32 v37, v11 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v39, v9 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v49, v7 -; SI-NEXT: v_mov_b32_e32 v48, v6 -; SI-NEXT: v_mov_b32_e32 v51, v5 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v53, v3 -; SI-NEXT: v_mov_b32_e32 v52, v2 -; SI-NEXT: v_mov_b32_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v54, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -32753,164 +33886,140 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v33, vcc -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: @@ -33060,53 +34169,53 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s20, 3 @@ -33141,73 +34250,137 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s44, s6, 16 ; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 ; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_or_b32_e32 v10, v21, v10 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v8i64_to_v32f16_scalar: @@ -33364,97 +34537,126 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -33473,6 +34675,13 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -33480,15 +34689,14 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33501,10 +34709,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -33512,10 +34720,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -33523,11 +34731,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -33535,11 +34743,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -33547,11 +34755,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -33559,11 +34767,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -33571,11 +34779,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -33583,11 +34791,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -33595,35 +34803,27 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33769,82 +34969,128 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_or_b32_e32 v9, v27, v9 +; SI-NEXT: v_or_b32_e32 v10, v26, v10 +; SI-NEXT: v_or_b32_e32 v11, v23, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33853,25 +35099,25 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -33879,11 +35125,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -33891,11 +35137,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -33906,8 +35152,8 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -33915,11 +35161,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -33927,10 +35173,10 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -33939,11 +35185,11 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -33951,10 +35197,10 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -33970,8 +35216,6 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB63_3: ; %end -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -34169,23 +35413,75 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v8i64_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -34202,132 +35498,124 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32bf16: @@ -34476,155 +35764,187 @@ define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s79, v1 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s79, 16 -; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: s_and_b32 s10, s77, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s77, 16 -; SI-NEXT: s_and_b32 s12, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s76, 16 -; SI-NEXT: s_and_b32 s14, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s75, 16 -; SI-NEXT: s_and_b32 s16, s74, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s74, 16 -; SI-NEXT: s_and_b32 s18, s73, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s73, 16 -; SI-NEXT: s_and_b32 s20, s72, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s72, 16 -; SI-NEXT: s_and_b32 s22, s63, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s63, 16 -; SI-NEXT: s_and_b32 s24, s62, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s62, 16 -; SI-NEXT: s_and_b32 s26, s61, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s61, 16 -; SI-NEXT: s_and_b32 s28, s60, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s60, 16 -; SI-NEXT: s_and_b32 s40, s59, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s59, 16 -; SI-NEXT: s_and_b32 s42, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s58, 16 -; SI-NEXT: s_and_b32 s44, s57, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s57, 16 -; SI-NEXT: s_and_b32 s46, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s56, 16 +; SI-NEXT: s_and_b32 s7, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s79, 16 +; SI-NEXT: s_and_b32 s9, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s78, 16 +; SI-NEXT: s_and_b32 s11, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_and_b32 s13, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_and_b32 s15, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s75, 16 +; SI-NEXT: s_and_b32 s17, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_and_b32 s19, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s73, 16 +; SI-NEXT: s_and_b32 s21, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_and_b32 s23, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s63, 16 +; SI-NEXT: s_and_b32 s25, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s27, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s61, 16 +; SI-NEXT: s_and_b32 s29, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s60, 16 +; SI-NEXT: s_and_b32 s41, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s59, 16 +; SI-NEXT: s_and_b32 s43, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_and_b32 s45, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s57, 16 +; SI-NEXT: s_and_b32 s47, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s56, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s56, 3 ; SI-NEXT: s_addc_u32 s5, s57, 0 -; SI-NEXT: s_add_u32 s43, s58, 3 -; SI-NEXT: s_addc_u32 s41, s59, 0 -; SI-NEXT: s_add_u32 s29, s60, 3 -; SI-NEXT: s_addc_u32 s27, s61, 0 -; SI-NEXT: s_add_u32 s25, s62, 3 -; SI-NEXT: s_addc_u32 s23, s63, 0 -; SI-NEXT: s_add_u32 s21, s72, 3 -; SI-NEXT: s_addc_u32 s19, s73, 0 -; SI-NEXT: s_add_u32 s17, s74, 3 -; SI-NEXT: s_addc_u32 s15, s75, 0 -; SI-NEXT: s_add_u32 s13, s76, 3 -; SI-NEXT: s_addc_u32 s11, s77, 0 -; SI-NEXT: s_add_u32 s9, s78, 3 -; SI-NEXT: s_addc_u32 s7, s79, 0 -; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s18, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_and_b32 s20, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s22, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s24, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_and_b32 s26, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_and_b32 s28, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_and_b32 s40, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s41, s41, 16 -; SI-NEXT: s_and_b32 s42, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s43, s43, 16 -; SI-NEXT: s_and_b32 s44, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s5, 16 -; SI-NEXT: s_and_b32 s46, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s47, s4, 16 +; SI-NEXT: s_add_u32 s42, s58, 3 +; SI-NEXT: s_addc_u32 s40, s59, 0 +; SI-NEXT: s_add_u32 s28, s60, 3 +; SI-NEXT: s_addc_u32 s26, s61, 0 +; SI-NEXT: s_add_u32 s24, s62, 3 +; SI-NEXT: s_addc_u32 s22, s63, 0 +; SI-NEXT: s_add_u32 s20, s72, 3 +; SI-NEXT: s_addc_u32 s18, s73, 0 +; SI-NEXT: s_add_u32 s16, s74, 3 +; SI-NEXT: s_addc_u32 s14, s75, 0 +; SI-NEXT: s_add_u32 s12, s76, 3 +; SI-NEXT: s_addc_u32 s10, s77, 0 +; SI-NEXT: s_add_u32 s8, s78, 3 +; SI-NEXT: s_addc_u32 s6, s79, 0 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s17, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s19, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s21, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s23, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_and_b32 s29, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s41, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_and_b32 s43, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_and_b32 s45, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s5, 16 +; SI-NEXT: s_and_b32 s47, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s47 -; SI-NEXT: v_mov_b32_e32 v1, s46 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v3, s44 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v5, s42 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s40 -; SI-NEXT: v_mov_b32_e32 v8, s29 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s15 -; SI-NEXT: v_mov_b32_e32 v23, s14 -; SI-NEXT: v_mov_b32_e32 v24, s13 -; SI-NEXT: v_mov_b32_e32 v25, s12 -; SI-NEXT: v_mov_b32_e32 v26, s11 -; SI-NEXT: v_mov_b32_e32 v27, s10 -; SI-NEXT: v_mov_b32_e32 v28, s9 -; SI-NEXT: v_mov_b32_e32 v29, s8 -; SI-NEXT: v_mov_b32_e32 v30, s7 -; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v8i64_to_v32bf16_scalar: @@ -34781,132 +36101,154 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB66_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v55, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v53, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v35, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 ; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB66_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -34914,62 +36256,62 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -34985,7 +36327,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -34993,23 +36335,12 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8i64: @@ -36176,7 +37507,39 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -36194,51 +37557,50 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 ; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 ; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 ; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 @@ -36260,16 +37622,16 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 ; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_mov_b32_e32 v20, v16 ; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v20 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 @@ -36278,19 +37640,19 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -36356,7 +37718,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -45386,103 +46748,121 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v8f64_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v36, v5 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v34, v3 -; SI-NEXT: v_mov_b32_e32 v33, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB72_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 -; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB72_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v33 -; SI-NEXT: v_mov_b32_e32 v6, v34 -; SI-NEXT: v_mov_b32_e32 v8, v35 -; SI-NEXT: v_mov_b32_e32 v10, v36 -; SI-NEXT: v_mov_b32_e32 v12, v37 -; SI-NEXT: v_mov_b32_e32 v14, v38 -; SI-NEXT: v_mov_b32_e32 v16, v48 -; SI-NEXT: v_mov_b32_e32 v18, v49 -; SI-NEXT: v_mov_b32_e32 v20, v50 -; SI-NEXT: v_mov_b32_e32 v22, v51 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v26, v53 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32i16: @@ -45570,102 +46950,134 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 -; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 -; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 -; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB73_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 -; SI-NEXT: v_mov_b32_e32 v28, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB73_2 ; ; VI-LABEL: bitcast_v8f64_to_v32i16_scalar: @@ -45802,95 +47214,114 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v37, v12 -; SI-NEXT: v_mov_b32_e32 v36, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB74_2 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v55 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v9, v9, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -45900,30 +47331,32 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB74_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -45940,23 +47373,23 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v51, v7 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v48, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -45973,12 +47406,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; SI-NEXT: .LBB74_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8f64: @@ -46122,153 +47550,184 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; SI-LABEL: bitcast_v32i16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v26, v14 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v19, v10 -; SI-NEXT: v_mov_b32_e32 v20, v8 -; SI-NEXT: v_mov_b32_e32 v21, v6 -; SI-NEXT: v_mov_b32_e32 v22, v4 -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 -; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v0, v18 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB75_3 ; SI-NEXT: .LBB75_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB75_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB75_4: @@ -46529,87 +47988,87 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -46632,71 +48091,119 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v54 -; SI-NEXT: v_mov_b32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v2, v53 -; SI-NEXT: v_mov_b32_e32 v3, v52 -; SI-NEXT: v_mov_b32_e32 v4, v51 -; SI-NEXT: v_mov_b32_e32 v5, v49 -; SI-NEXT: v_mov_b32_e32 v6, v50 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: v_mov_b32_e32 v8, v48 -; SI-NEXT: v_mov_b32_e32 v9, v36 -; SI-NEXT: v_mov_b32_e32 v10, v38 -; SI-NEXT: v_mov_b32_e32 v11, v34 -; SI-NEXT: v_mov_b32_e32 v12, v37 -; SI-NEXT: v_mov_b32_e32 v13, v32 -; SI-NEXT: v_mov_b32_e32 v14, v35 -; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: @@ -46784,166 +48291,228 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v54, s16 -; SI-NEXT: v_mov_b32_e32 v55, s17 -; SI-NEXT: v_mov_b32_e32 v52, s18 -; SI-NEXT: v_mov_b32_e32 v53, s19 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v51, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v38, s24 -; SI-NEXT: v_mov_b32_e32 v39, s25 -; SI-NEXT: v_mov_b32_e32 v36, s26 -; SI-NEXT: v_mov_b32_e32 v37, s27 -; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s25 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s27 +; SI-NEXT: v_mov_b32_e32 v3, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v35, s29 +; SI-NEXT: v_mov_b32_e32 v4, s29 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[31:32], v[54:55], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v32 -; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v8f64_to_v32f16_scalar: @@ -47080,97 +48649,126 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v31, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -47189,6 +48787,13 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -47196,15 +48801,14 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB78_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -47217,10 +48821,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -47228,10 +48832,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -47239,11 +48843,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -47251,11 +48855,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -47263,11 +48867,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -47275,11 +48879,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -47287,11 +48891,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -47299,11 +48903,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -47311,35 +48915,27 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: .LBB78_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -47485,82 +49081,128 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v30, v8 +; SI-NEXT: v_or_b32_e32 v9, v27, v9 +; SI-NEXT: v_or_b32_e32 v10, v26, v10 +; SI-NEXT: v_or_b32_e32 v11, v23, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -47569,25 +49211,25 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -47595,11 +49237,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -47607,11 +49249,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -47622,8 +49264,8 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -47631,11 +49273,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -47643,10 +49285,10 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -47655,11 +49297,11 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -47667,10 +49309,10 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -47686,8 +49328,6 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: .LBB79_3: ; %end -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -47886,74 +49526,74 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -47974,56 +49614,104 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v55 -; SI-NEXT: v_mov_b32_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v2, v53 -; SI-NEXT: v_mov_b32_e32 v3, v52 -; SI-NEXT: v_mov_b32_e32 v4, v51 -; SI-NEXT: v_mov_b32_e32 v5, v50 -; SI-NEXT: v_mov_b32_e32 v6, v49 -; SI-NEXT: v_mov_b32_e32 v7, v48 -; SI-NEXT: v_mov_b32_e32 v8, v39 -; SI-NEXT: v_mov_b32_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v11, v36 -; SI-NEXT: v_mov_b32_e32 v12, v35 -; SI-NEXT: v_mov_b32_e32 v13, v34 -; SI-NEXT: v_mov_b32_e32 v14, v33 -; SI-NEXT: v_mov_b32_e32 v15, v32 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32bf16: @@ -48110,136 +49798,248 @@ define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg % ; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s21 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v54, s16 -; SI-NEXT: v_mov_b32_e32 v55, s17 -; SI-NEXT: v_mov_b32_e32 v52, s18 -; SI-NEXT: v_mov_b32_e32 v53, s19 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v51, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v38, s24 -; SI-NEXT: v_mov_b32_e32 v39, s25 -; SI-NEXT: v_mov_b32_e32 v36, s26 -; SI-NEXT: v_mov_b32_e32 v37, s27 -; SI-NEXT: v_mov_b32_e32 v34, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v35, s29 -; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s18, v9 +; SI-NEXT: v_readfirstlane_b32 s19, v10 +; SI-NEXT: v_readfirstlane_b32 s16, v11 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v13 +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v34 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 -; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: s_and_b32 s79, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s13, 16 +; SI-NEXT: s_and_b32 s77, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s12, 16 +; SI-NEXT: s_and_b32 s75, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s11, 16 +; SI-NEXT: s_and_b32 s73, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s10, 16 +; SI-NEXT: s_and_b32 s63, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s15, 16 +; SI-NEXT: s_and_b32 s61, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s14, 16 +; SI-NEXT: s_and_b32 s59, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s17, 16 +; SI-NEXT: s_and_b32 s57, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s16, 16 +; SI-NEXT: s_and_b32 s47, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s19, 16 +; SI-NEXT: s_and_b32 s45, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s18, 16 +; SI-NEXT: s_and_b32 s43, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s9, 16 +; SI-NEXT: s_and_b32 s41, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s8, 16 +; SI-NEXT: s_and_b32 s29, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s7, 16 +; SI-NEXT: s_and_b32 s27, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s6, 16 +; SI-NEXT: s_and_b32 s25, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s5, 16 +; SI-NEXT: s_and_b32 s23, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s4, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 ; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 -; SI-NEXT: .LBB81_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v33 -; SI-NEXT: v_mov_b32_e32 v1, v32 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[23:24], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[30:31], s[4:5], 1.0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_branch .LBB81_5 +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v1, s79 +; SI-NEXT: v_mov_b32_e32 v0, s78 +; SI-NEXT: v_mov_b32_e32 v3, s77 +; SI-NEXT: v_mov_b32_e32 v2, s76 +; SI-NEXT: v_mov_b32_e32 v5, s75 +; SI-NEXT: v_mov_b32_e32 v4, s74 +; SI-NEXT: v_mov_b32_e32 v7, s73 +; SI-NEXT: v_mov_b32_e32 v6, s72 +; SI-NEXT: v_mov_b32_e32 v9, s63 +; SI-NEXT: v_mov_b32_e32 v8, s62 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v13, s59 +; SI-NEXT: v_mov_b32_e32 v12, s58 +; SI-NEXT: v_mov_b32_e32 v15, s57 +; SI-NEXT: v_mov_b32_e32 v14, s56 +; SI-NEXT: v_mov_b32_e32 v17, s47 +; SI-NEXT: v_mov_b32_e32 v16, s46 +; SI-NEXT: v_mov_b32_e32 v18, s45 +; SI-NEXT: v_mov_b32_e32 v19, s44 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v20, s42 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s27 +; SI-NEXT: v_mov_b32_e32 v27, s26 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: v_mov_b32_e32 v31, s23 +; SI-NEXT: v_mov_b32_e32 v30, s22 +; SI-NEXT: .LBB81_5: ; %end +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[50:51], 16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32bf16_scalar: ; VI: ; %bb.0: @@ -48375,132 +50175,154 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB82_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v55, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v53, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v35, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 ; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 ; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 ; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: .LBB82_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB82_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -48508,62 +50330,62 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -48579,7 +50401,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -48587,23 +50409,12 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: .LBB82_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8f64: @@ -49770,7 +51581,39 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -49788,51 +51631,50 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 ; SI-NEXT: v_lshr_b64 v[0:1], v[39:40], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 ; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 ; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[37:38], 16 @@ -49854,16 +51696,16 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_lshr_b64 v[13:14], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 ; SI-NEXT: v_lshr_b64 v[14:15], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_mov_b32_e32 v20, v16 ; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v20 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 @@ -49872,19 +51714,19 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -49950,7 +51792,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -58969,56 +60811,109 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v61, v29 -; SI-NEXT: v_mov_b32_e32 v60, v28 -; SI-NEXT: v_mov_b32_e32 v59, v27 -; SI-NEXT: v_mov_b32_e32 v58, v26 -; SI-NEXT: v_mov_b32_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v56, v24 -; SI-NEXT: v_mov_b32_e32 v47, v23 -; SI-NEXT: v_mov_b32_e32 v46, v22 -; SI-NEXT: v_mov_b32_e32 v45, v21 -; SI-NEXT: v_mov_b32_e32 v44, v20 -; SI-NEXT: v_mov_b32_e32 v43, v19 -; SI-NEXT: v_mov_b32_e32 v42, v18 -; SI-NEXT: v_mov_b32_e32 v41, v17 -; SI-NEXT: v_mov_b32_e32 v40, v16 -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v39, v7 -; SI-NEXT: v_mov_b32_e32 v38, v6 -; SI-NEXT: v_mov_b32_e32 v37, v5 -; SI-NEXT: v_mov_b32_e32 v36, v4 -; SI-NEXT: v_mov_b32_e32 v35, v3 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v63 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -59034,78 +60929,6 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -59126,89 +60949,152 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -59353,167 +61239,223 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i ; SI-LABEL: bitcast_v32i16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v17 -; SI-NEXT: v_mov_b32_e32 v53, v16 -; SI-NEXT: v_mov_b32_e32 v52, v15 -; SI-NEXT: v_mov_b32_e32 v51, v14 -; SI-NEXT: v_mov_b32_e32 v50, v13 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v48, v11 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_mov_b32_e32 v37, v8 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v34, v5 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: v_mov_b32_e32 v32, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v2 -; SI-NEXT: v_mov_b32_e32 v40, v1 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v27 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: @@ -59767,186 +61709,278 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v24, v24, v30 +; SI-NEXT: v_or_b32_e32 v22, v22, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v20, v20, v27 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v23 +; SI-NEXT: v_alignbit_b32 v31, v1, v31, 16 +; SI-NEXT: v_alignbit_b32 v30, v3, v30, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v7, v28, 16 +; SI-NEXT: v_alignbit_b32 v27, v9, v27, 16 +; SI-NEXT: v_alignbit_b32 v26, v11, v26, 16 +; SI-NEXT: v_alignbit_b32 v25, v13, v25, 16 +; SI-NEXT: v_alignbit_b32 v23, v15, v23, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32i16: @@ -60091,198 +62125,277 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v16 -; SI-NEXT: v_mov_b32_e32 v25, v15 -; SI-NEXT: v_mov_b32_e32 v26, v12 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v30, v7 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v33, v3 -; SI-NEXT: v_mov_b32_e32 v34, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s29 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: s_lshr_b32 s7, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_or_b32_e32 v52, v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_or_b32_e32 v26, v26, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_or_b32_e32 v55, v17, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_or_b32_e32 v53, v16, v4 +; SI-NEXT: v_or_b32_e32 v49, v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshr_b64 v[37:38], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[29:30], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v54, v17, v8 +; SI-NEXT: v_or_b32_e32 v50, v16, v10 +; SI-NEXT: v_or_b32_e32 v38, v18, v12 +; SI-NEXT: v_or_b32_e32 v36, v19, v14 +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[14:15], 16 ; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v37 -; SI-NEXT: v_mov_b32_e32 v5, v35 -; SI-NEXT: v_mov_b32_e32 v9, v33 -; SI-NEXT: v_mov_b32_e32 v13, v38 -; SI-NEXT: v_mov_b32_e32 v17, v48 -; SI-NEXT: v_mov_b32_e32 v21, v49 -; SI-NEXT: v_mov_b32_e32 v25, v52 -; SI-NEXT: v_mov_b32_e32 v29, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -60479,85 +62592,39 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v55, v28 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: v_mov_b32_e32 v53, v24 -; SI-NEXT: v_mov_b32_e32 v52, v22 -; SI-NEXT: v_mov_b32_e32 v51, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_mov_b32_e32 v48, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v34, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_4 -; SI-NEXT: .LBB92_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB92_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -60567,114 +62634,216 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 -; SI-NEXT: .LBB92_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB92_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_or_b32_e32 v11, v27, v11 +; SI-NEXT: v_or_b32_e32 v10, v26, v10 +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v15 +; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v35 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v37 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v55 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v32bf16: @@ -60818,191 +62987,299 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_mov_b32_e32 v33, v16 -; SI-NEXT: v_mov_b32_e32 v16, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v17, s30, 0 +; SI-NEXT: v_writelane_b32 v17, s31, 1 +; SI-NEXT: v_writelane_b32 v17, s34, 2 +; SI-NEXT: v_writelane_b32 v17, s35, 3 +; SI-NEXT: v_writelane_b32 v17, s36, 4 +; SI-NEXT: v_writelane_b32 v17, s37, 5 +; SI-NEXT: v_writelane_b32 v17, s38, 6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s35, s29, 16 +; SI-NEXT: s_lshr_b32 s34, s28, 16 +; SI-NEXT: s_lshr_b32 s31, s27, 16 +; SI-NEXT: s_lshr_b32 s30, s26, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: s_lshr_b32 s94, s24, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s22, 16 +; SI-NEXT: s_lshr_b32 s91, s21, 16 +; SI-NEXT: s_lshr_b32 s90, s20, 16 +; SI-NEXT: s_lshr_b32 s89, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s18, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_lshr_b32 s78, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_writelane_b32 v17, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s38, v1 +; SI-NEXT: v_readfirstlane_b32 s36, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v3 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_readfirstlane_b32 s37, v4 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s14, s16, 16 -; SI-NEXT: s_lshl_b32 s15, s17, 16 -; SI-NEXT: s_lshl_b32 s40, s18, 16 -; SI-NEXT: s_lshl_b32 s41, s19, 16 -; SI-NEXT: s_lshl_b32 s42, s20, 16 -; SI-NEXT: s_lshl_b32 s43, s21, 16 -; SI-NEXT: s_lshl_b32 s6, s22, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_lshl_b32 s8, s24, 16 -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_lshl_b32 s10, s26, 16 +; SI-NEXT: s_lshl_b32 s73, s16, 16 +; SI-NEXT: s_lshl_b32 s77, s78, 16 +; SI-NEXT: s_lshl_b32 s63, s17, 16 +; SI-NEXT: s_lshl_b32 s76, s79, 16 +; SI-NEXT: s_lshl_b32 s61, s18, 16 +; SI-NEXT: s_lshl_b32 s75, s88, 16 +; SI-NEXT: s_lshl_b32 s59, s19, 16 +; SI-NEXT: s_lshl_b32 s74, s89, 16 +; SI-NEXT: s_lshl_b32 s57, s20, 16 +; SI-NEXT: s_lshl_b32 s72, s90, 16 +; SI-NEXT: s_lshl_b32 s47, s21, 16 +; SI-NEXT: s_lshl_b32 s62, s91, 16 +; SI-NEXT: s_lshl_b32 s45, s22, 16 +; SI-NEXT: s_lshl_b32 s60, s92, 16 +; SI-NEXT: s_lshl_b32 s43, s23, 16 +; SI-NEXT: s_lshl_b32 s58, s93, 16 +; SI-NEXT: s_lshl_b32 s41, s24, 16 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_lshl_b32 s46, s95, 16 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_lshl_b32 s44, s30, 16 ; SI-NEXT: s_lshl_b32 s11, s27, 16 -; SI-NEXT: s_lshl_b32 s12, s28, 16 -; SI-NEXT: s_lshl_b32 s13, s29, 16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_lshl_b32 s8, s29, 16 +; SI-NEXT: s_lshl_b32 s14, s35, 16 +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_lshl_b32 s12, s37, 16 +; SI-NEXT: s_lshl_b32 s6, s38, 16 +; SI-NEXT: s_lshl_b32 s9, s39, 16 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 +; SI-NEXT: s_add_i32 s36, s36, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s26, 0xffff -; SI-NEXT: s_lshl_b32 s6, s27, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s5, s36, 0xffff +; SI-NEXT: s_lshl_b32 s6, s37, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xffff -; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_and_b32 s6, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xffff +; SI-NEXT: s_lshl_b32 s8, s34, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s27, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s30, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s25, 0xffff +; SI-NEXT: s_lshl_b32 s11, s95, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s24, 0xffff +; SI-NEXT: s_lshl_b32 s12, s94, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s93, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s8, s6, 0x30000 -; SI-NEXT: s_and_b32 s6, s22, 0xffff -; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s14, s92, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s21, 0xffff +; SI-NEXT: s_lshl_b32 s15, s91, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xffff -; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: s_lshl_b32 s20, s90, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s20, s89, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s18, 0xffff -; SI-NEXT: s_lshl_b32 s10, s19, 16 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s88, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s20, s79, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s16, 0xffff -; SI-NEXT: s_lshl_b32 s11, s17, 16 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v3, v29, v3 -; SI-NEXT: v_or_b32_e32 v5, v27, v5 -; SI-NEXT: v_or_b32_e32 v7, v25, v7 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v2, v17, v2 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s20, s78, 16 +; SI-NEXT: s_or_b32 s16, s20, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s10, 16 -; SI-NEXT: s_and_b32 s41, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s9, 16 -; SI-NEXT: s_and_b32 s43, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s7, 16 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s5, 16 -; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s4, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s16, 16 +; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s17, 16 +; SI-NEXT: s_and_b32 s75, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s18, 16 +; SI-NEXT: s_and_b32 s74, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s19, 16 +; SI-NEXT: s_and_b32 s72, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s15, 16 +; SI-NEXT: s_and_b32 s62, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s14, 16 +; SI-NEXT: s_and_b32 s60, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s13, 16 +; SI-NEXT: s_and_b32 s58, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s12, 16 +; SI-NEXT: s_and_b32 s56, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s11, 16 +; SI-NEXT: s_and_b32 s46, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s10, 16 +; SI-NEXT: s_and_b32 s44, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s9, 16 +; SI-NEXT: s_and_b32 s42, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s8, 16 +; SI-NEXT: s_and_b32 s40, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s14, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_and_b32 s12, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_mov_b32_e32 v2, s40 -; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: v_mov_b32_e32 v4, s42 -; SI-NEXT: v_mov_b32_e32 v5, s43 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_mov_b32_e32 v9, s9 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: v_mov_b32_e32 v11, s11 -; SI-NEXT: v_mov_b32_e32 v12, s12 -; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s73 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s72 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s57 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s47 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s60 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s43 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s41 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s15 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s44 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s13 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s42 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s11 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s40 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: v_readlane_b32 s39, v17, 7 +; SI-NEXT: v_readlane_b32 s38, v17, 6 +; SI-NEXT: v_readlane_b32 s37, v17, 5 +; SI-NEXT: v_readlane_b32 s36, v17, 4 +; SI-NEXT: v_readlane_b32 s35, v17, 3 +; SI-NEXT: v_readlane_b32 s34, v17, 2 +; SI-NEXT: v_readlane_b32 s31, v17, 1 +; SI-NEXT: v_readlane_b32 s30, v17, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar: @@ -61256,135 +63533,164 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v17 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v54 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -61407,178 +63713,226 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: .LBB94_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_alignbit_b32 v6, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_alignbit_b32 v8, v8, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_alignbit_b32 v10, v10, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 +; SI-NEXT: v_alignbit_b32 v12, v12, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v14, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v15, v16, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v13, v17, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v11, v18, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v9, v19, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v7, v20, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v5, v21, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v27 +; SI-NEXT: v_alignbit_b32 v3, v22, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v24, v1, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v25, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v29, 16 +; SI-NEXT: v_alignbit_b32 v30, v9, v30, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v31, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v32, 16 +; SI-NEXT: v_alignbit_b32 v35, v15, v35, 16 ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v32bf16_to_v32i16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB94_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32bf16_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB94_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc @@ -62653,7 +65007,39 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -62673,202 +65059,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s6 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[9:10], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshr_b64 v[13:14], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshr_b64 v[23:24], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_lshr_b64 v[25:26], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[26:27], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_lshr_b64 v[27:28], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_lshr_b64 v[7:8], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[15:16], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[30:31], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 -; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[32:33], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[39:40], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v5, v40 -; SI-NEXT: v_mov_b32_e32 v9, v41 -; SI-NEXT: v_mov_b32_e32 v13, v42 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v21, v44 -; SI-NEXT: v_mov_b32_e32 v25, v45 -; SI-NEXT: v_mov_b32_e32 v29, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62885,56 +65273,72 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v3, v53 -; SI-NEXT: v_mov_b32_e32 v7, v51 -; SI-NEXT: v_mov_b32_e32 v11, v49 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_mov_b32_e32 v23, v36 -; SI-NEXT: v_mov_b32_e32 v27, v34 -; SI-NEXT: v_mov_b32_e32 v31, v32 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: @@ -64222,816 +66626,778 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v30 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v44 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v18 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v56, v1, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v47, v1, v27 -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v58, v1, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v57, v1, v56 +; SI-NEXT: v_alignbit_b32 v1, v57, v58, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v57, v58, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v51, v1, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v39, v1, v60 +; SI-NEXT: v_alignbit_b32 v1, v39, v51, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v51, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v50, v1, v37 +; SI-NEXT: v_or_b32_e32 v34, v1, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v33, v1, v62 +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v38, v1, v36 -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v32, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v31, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v29, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v28, v1, v38 +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v34, v1, v48 +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v33, v1, v39 -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v26, v1, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v25, v1, v49 +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v32, v1, v51 +; SI-NEXT: v_or_b32_e32 v24, v1, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v22, v1, v52 +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v31, v1, v49 -; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v21, v1, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v17, v1, v54 +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v30, v1, v53 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v26, v1, v52 -; SI-NEXT: v_alignbit_b32 v1, v26, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v22, v1, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v18, v1, v54 -; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v27, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v23, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v14, v1, v41 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 -; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 -; SI-NEXT: v_bfe_u32 v62, v44, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v19, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 +; SI-NEXT: v_alignbit_b32 v41, v57, v58, 8 +; SI-NEXT: v_alignbit_b32 v40, v39, v51, 8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v57 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v39 +; SI-NEXT: v_bfe_u32 v47, v44, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v35, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v30, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v10, v1, v40 -; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v6, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v2, v1, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_alignbit_b32 v1, v57, v58, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_alignbit_b32 v1, v57, v58, 16 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v51, 24 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v51, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: v_or_b32_e32 v9, v48, v9 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 +; SI-NEXT: v_or_b32_e32 v11, v50, v11 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: .LBB96_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB96_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v4, v41, v4 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1 -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v55, v4 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v52, v4 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v49, v4 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 24 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_or_b32_e32 v11, v49, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v29, 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 24 +; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 16 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v26, 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 24 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v24, v31, v32, 24 -; SI-NEXT: v_alignbit_b32 v28, v31, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24 -; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16 -; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8 -; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 -; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 -; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v17 +; SI-NEXT: v_alignbit_b32 v41, v57, v58, 8 +; SI-NEXT: v_alignbit_b32 v40, v39, v51, 8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -66399,506 +68765,465 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_writelane_b32 v20, s54, 14 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_writelane_b32 v20, s64, 16 -; SI-NEXT: v_writelane_b32 v20, s65, 17 -; SI-NEXT: v_writelane_b32 v20, s66, 18 -; SI-NEXT: v_writelane_b32 v20, s67, 19 -; SI-NEXT: v_writelane_b32 v20, s68, 20 -; SI-NEXT: v_writelane_b32 v20, s69, 21 -; SI-NEXT: v_writelane_b32 v20, s70, 22 -; SI-NEXT: v_writelane_b32 v20, s71, 23 -; SI-NEXT: v_writelane_b32 v20, s80, 24 -; SI-NEXT: v_writelane_b32 v20, s81, 25 -; SI-NEXT: v_writelane_b32 v20, s82, 26 -; SI-NEXT: v_writelane_b32 v20, s83, 27 -; SI-NEXT: v_writelane_b32 v20, s84, 28 -; SI-NEXT: v_writelane_b32 v20, s85, 29 -; SI-NEXT: v_writelane_b32 v20, s86, 30 -; SI-NEXT: v_writelane_b32 v20, s87, 31 -; SI-NEXT: v_writelane_b32 v20, s96, 32 -; SI-NEXT: v_writelane_b32 v20, s97, 33 -; SI-NEXT: v_writelane_b32 v20, s98, 34 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v20, s99, 35 -; SI-NEXT: s_mov_b32 s93, s18 -; SI-NEXT: s_mov_b32 s31, s17 -; SI-NEXT: v_readfirstlane_b32 s59, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v17 -; SI-NEXT: v_readfirstlane_b32 s63, v16 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s72, v14 -; SI-NEXT: v_readfirstlane_b32 s76, v13 -; SI-NEXT: v_readfirstlane_b32 s57, v12 -; SI-NEXT: v_readfirstlane_b32 s61, v11 -; SI-NEXT: v_readfirstlane_b32 s44, v10 -; SI-NEXT: v_readfirstlane_b32 s58, v9 -; SI-NEXT: v_readfirstlane_b32 s62, v8 -; SI-NEXT: v_readfirstlane_b32 s45, v7 -; SI-NEXT: v_readfirstlane_b32 s96, v6 -; SI-NEXT: v_readfirstlane_b32 s97, v5 -; SI-NEXT: v_readfirstlane_b32 s99, v4 -; SI-NEXT: v_readfirstlane_b32 s46, v3 -; SI-NEXT: v_readfirstlane_b32 s83, v2 +; SI-NEXT: v_writelane_b32 v6, s30, 0 +; SI-NEXT: v_writelane_b32 v6, s31, 1 +; SI-NEXT: v_writelane_b32 v6, s34, 2 +; SI-NEXT: v_writelane_b32 v6, s35, 3 +; SI-NEXT: v_writelane_b32 v6, s36, 4 +; SI-NEXT: v_writelane_b32 v6, s37, 5 +; SI-NEXT: v_writelane_b32 v6, s38, 6 +; SI-NEXT: v_writelane_b32 v6, s39, 7 +; SI-NEXT: v_writelane_b32 v6, s48, 8 +; SI-NEXT: v_writelane_b32 v6, s49, 9 +; SI-NEXT: v_writelane_b32 v6, s50, 10 +; SI-NEXT: v_writelane_b32 v6, s51, 11 +; SI-NEXT: v_writelane_b32 v6, s52, 12 +; SI-NEXT: v_writelane_b32 v6, s53, 13 +; SI-NEXT: v_writelane_b32 v6, s54, 14 +; SI-NEXT: v_writelane_b32 v6, s55, 15 +; SI-NEXT: v_writelane_b32 v6, s64, 16 +; SI-NEXT: v_writelane_b32 v6, s65, 17 +; SI-NEXT: v_writelane_b32 v6, s66, 18 +; SI-NEXT: v_writelane_b32 v6, s67, 19 +; SI-NEXT: v_writelane_b32 v6, s68, 20 +; SI-NEXT: v_writelane_b32 v6, s69, 21 +; SI-NEXT: v_writelane_b32 v6, s70, 22 +; SI-NEXT: v_writelane_b32 v6, s71, 23 +; SI-NEXT: v_writelane_b32 v6, s80, 24 +; SI-NEXT: v_writelane_b32 v6, s81, 25 +; SI-NEXT: v_writelane_b32 v6, s82, 26 +; SI-NEXT: v_writelane_b32 v6, s83, 27 +; SI-NEXT: v_writelane_b32 v6, s84, 28 +; SI-NEXT: v_writelane_b32 v6, s85, 29 +; SI-NEXT: v_writelane_b32 v6, s86, 30 +; SI-NEXT: v_writelane_b32 v6, s87, 31 +; SI-NEXT: v_writelane_b32 v6, s96, 32 +; SI-NEXT: v_writelane_b32 v6, s97, 33 +; SI-NEXT: v_writelane_b32 v6, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: s_lshr_b32 s68, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s70, s27, 16 +; SI-NEXT: s_lshr_b32 s98, s26, 16 +; SI-NEXT: s_lshr_b32 s71, s25, 16 +; SI-NEXT: s_lshr_b32 s96, s24, 16 +; SI-NEXT: s_lshr_b32 s80, s23, 16 +; SI-NEXT: s_lshr_b32 s87, s22, 16 +; SI-NEXT: s_lshr_b32 s81, s21, 16 +; SI-NEXT: s_lshr_b32 s86, s20, 16 +; SI-NEXT: s_lshr_b32 s82, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s18, 16 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s84, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s56, v2 +; SI-NEXT: v_readfirstlane_b32 s58, v1 +; SI-NEXT: v_readfirstlane_b32 s69, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s85, v1 -; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s60, v5 +; SI-NEXT: v_writelane_b32 v6, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v7, s4, 2 +; SI-NEXT: v_writelane_b32 v7, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v7, s4, 0 +; SI-NEXT: v_writelane_b32 v7, s5, 1 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 ; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s93, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 ; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: v_writelane_b32 v7, s4, 6 +; SI-NEXT: v_writelane_b32 v7, s5, 7 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v21, s4, 0 -; SI-NEXT: v_writelane_b32 v21, s5, 1 +; SI-NEXT: v_writelane_b32 v7, s4, 4 +; SI-NEXT: v_writelane_b32 v7, s5, 5 ; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v7, s4, 8 +; SI-NEXT: v_writelane_b32 v7, s5, 9 ; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 ; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 ; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v21, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s5, 17 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 ; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s42, s4, s5 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s83, 16 -; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: s_and_b32 s4, s46, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 -; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s97, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 -; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_lshl_b32 s5, s46, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: s_and_b32 s78, s72, 0xffff -; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s47, s59, 16 -; SI-NEXT: s_mov_b32 s35, s78 -; SI-NEXT: s_mov_b32 s78, s93 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 -; SI-NEXT: s_or_b32 s5, s5, s47 -; SI-NEXT: s_lshr_b32 s79, s7, 8 -; SI-NEXT: s_mov_b32 s93, s78 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 -; SI-NEXT: s_mov_b32 s78, s31 -; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s88, s5, 8 -; SI-NEXT: s_bfe_u32 s89, s72, 0x80008 -; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 -; SI-NEXT: s_mov_b32 s95, s79 -; SI-NEXT: s_mov_b32 s31, s78 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 -; SI-NEXT: s_and_b32 s90, s59, 0xffff -; SI-NEXT: s_mov_b32 s37, s89 -; SI-NEXT: s_mov_b32 s79, s88 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_bfe_u32 vcc_lo, s59, 0x80008 -; SI-NEXT: s_mov_b32 s89, s90 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s60, s41, 8 -; SI-NEXT: s_lshr_b32 s87, s15, 8 -; SI-NEXT: s_lshr_b32 s82, s11, 8 -; SI-NEXT: s_lshr_b32 s71, s43, 8 -; SI-NEXT: s_lshr_b32 s68, s13, 8 -; SI-NEXT: s_lshr_b32 s73, s9, 8 -; SI-NEXT: s_and_b32 s74, s19, 0xffff -; SI-NEXT: s_and_b32 s98, s23, 0xffff -; SI-NEXT: s_and_b32 s84, s27, 0xffff -; SI-NEXT: s_and_b32 s80, s83, 0xffff -; SI-NEXT: s_and_b32 s69, s96, 0xffff -; SI-NEXT: s_and_b32 s75, s44, 0xffff -; SI-NEXT: s_bfe_u32 s47, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s56, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s86, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s81, s83, 0x80008 -; SI-NEXT: s_bfe_u32 s70, s96, 0x80008 -; SI-NEXT: s_bfe_u32 s77, s44, 0x80008 -; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 -; SI-NEXT: s_mov_b32 s91, vcc_lo +; SI-NEXT: s_and_b32 s5, s56, 0xffff +; SI-NEXT: s_lshl_b32 s44, s69, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_lshr_b32 s99, s43, 8 +; SI-NEXT: s_lshr_b32 s93, s41, 8 +; SI-NEXT: s_lshr_b32 s89, s13, 8 +; SI-NEXT: s_lshr_b32 s77, s9, 8 +; SI-NEXT: s_lshr_b32 s73, s15, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s57, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: s_bfe_u32 s44, s83, 0x80008 +; SI-NEXT: s_bfe_u32 s97, s82, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s81, 0x80008 +; SI-NEXT: s_bfe_u32 s79, s80, 0x80008 +; SI-NEXT: s_bfe_u32 s75, s71, 0x80008 +; SI-NEXT: s_bfe_u32 s63, s70, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s68, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s69, 0x80008 +; SI-NEXT: s_lshr_b64 s[64:65], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: s_add_i32 s56, s56, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s59, 16 -; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_and_b32 s5, s56, 0xffff +; SI-NEXT: s_lshl_b32 s6, s69, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s61, 0xffff -; SI-NEXT: s_lshl_b32 s7, s57, 16 -; SI-NEXT: s_add_i32 s76, s76, 3 +; SI-NEXT: s_and_b32 s6, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s76, 0xffff -; SI-NEXT: s_lshl_b32 s8, s72, 16 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_and_b32 s7, s29, 0xffff +; SI-NEXT: s_lshl_b32 s8, s68, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s45, 0xffff -; SI-NEXT: s_lshl_b32 s9, s62, 16 -; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s98, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s10, s8, 0x30000 +; SI-NEXT: s_and_b32 s8, s27, 0xffff +; SI-NEXT: s_lshl_b32 s9, s70, 16 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s58, 0xffff -; SI-NEXT: s_lshl_b32 s10, s44, 16 -; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s46, 0xffff -; SI-NEXT: s_lshl_b32 s11, s99, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s97, s97, 3 -; SI-NEXT: s_add_i32 s12, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s97, 0xffff -; SI-NEXT: s_lshl_b32 s11, s96, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s13, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s85, s85, 3 -; SI-NEXT: s_add_i32 s42, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s85, 0xffff -; SI-NEXT: s_lshl_b32 s11, s83, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s43, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s24, 0xffff -; SI-NEXT: s_lshl_b32 s11, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s26, 0xffff -; SI-NEXT: s_lshl_b32 s14, s27, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s11, s14, s11 -; SI-NEXT: s_and_b32 s14, s20, 0xffff -; SI-NEXT: s_lshl_b32 s15, s21, 16 +; SI-NEXT: s_add_i32 s11, s8, 0x30000 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s96, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s8, 0x30000 +; SI-NEXT: s_and_b32 s8, s25, 0xffff +; SI-NEXT: s_lshl_b32 s9, s71, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s23, 16 +; SI-NEXT: s_add_i32 s15, s8, 0x30000 +; SI-NEXT: s_and_b32 s8, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s87, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s80, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s9, s12, s9 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s13, s86, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s20, s81, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s13, s20, s13 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s85, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s40, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s82, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s15, s17, s15 +; SI-NEXT: s_add_i32 s41, s18, 0x30000 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s31, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s40, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s93, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s19, 16 +; SI-NEXT: s_lshl_b32 s18, s84, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s42, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s83, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_add_i32 s43, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v7, s16, 2 +; SI-NEXT: v_writelane_b32 v7, s17, 3 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v7, s16, 0 +; SI-NEXT: v_writelane_b32 v7, s17, 1 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 4 -; SI-NEXT: v_writelane_b32 v21, s17, 5 +; SI-NEXT: v_writelane_b32 v7, s16, 6 +; SI-NEXT: v_writelane_b32 v7, s17, 7 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v21, s16, 2 -; SI-NEXT: v_writelane_b32 v21, s17, 3 -; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 -; SI-NEXT: s_add_i32 s14, s14, 0x30000 -; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 0 -; SI-NEXT: v_writelane_b32 v21, s17, 1 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 10 -; SI-NEXT: v_writelane_b32 v21, s17, 11 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v21, s16, 8 -; SI-NEXT: v_writelane_b32 v21, s17, 9 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: s_add_i32 s11, s11, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 6 -; SI-NEXT: v_writelane_b32 v21, s17, 7 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 16 -; SI-NEXT: v_writelane_b32 v21, s17, 17 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: v_writelane_b32 v7, s16, 4 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 14 -; SI-NEXT: v_writelane_b32 v21, s17, 15 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v21, s16, 12 -; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s47, s41, 24 -; SI-NEXT: s_lshr_b32 s74, s41, 16 -; SI-NEXT: s_lshr_b32 s60, s41, 8 -; SI-NEXT: s_lshr_b32 s56, s15, 24 -; SI-NEXT: s_lshr_b32 s98, s15, 16 -; SI-NEXT: s_lshr_b32 s87, s15, 8 -; SI-NEXT: s_lshr_b32 s86, s11, 24 -; SI-NEXT: s_lshr_b32 s84, s11, 16 -; SI-NEXT: s_lshr_b32 s82, s11, 8 -; SI-NEXT: s_lshr_b32 s81, s43, 24 -; SI-NEXT: s_lshr_b32 s80, s43, 16 -; SI-NEXT: s_lshr_b32 s71, s43, 8 -; SI-NEXT: s_lshr_b32 s70, s13, 24 -; SI-NEXT: s_lshr_b32 s69, s13, 16 -; SI-NEXT: s_lshr_b32 s68, s13, 8 -; SI-NEXT: s_lshr_b32 s77, s9, 24 -; SI-NEXT: s_lshr_b32 s75, s9, 16 -; SI-NEXT: s_lshr_b32 s73, s9, 8 -; SI-NEXT: s_lshr_b32 s37, s7, 24 -; SI-NEXT: s_lshr_b32 s35, s7, 16 -; SI-NEXT: s_lshr_b32 s95, s7, 8 -; SI-NEXT: s_lshr_b32 s91, s5, 24 -; SI-NEXT: s_lshr_b32 s89, s5, 16 -; SI-NEXT: s_lshr_b32 s79, s5, 8 -; SI-NEXT: v_writelane_b32 v21, s17, 13 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v7, s17, 5 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v7, s16, 8 +; SI-NEXT: s_lshr_b64 s[64:65], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s44, s43, 24 +; SI-NEXT: s_lshr_b32 s83, s43, 16 +; SI-NEXT: s_lshr_b32 s99, s43, 8 +; SI-NEXT: s_lshr_b32 s97, s41, 24 +; SI-NEXT: s_lshr_b32 s82, s41, 16 +; SI-NEXT: s_lshr_b32 s93, s41, 8 +; SI-NEXT: s_lshr_b32 s91, s13, 24 +; SI-NEXT: s_lshr_b32 s81, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s13, 8 +; SI-NEXT: s_lshr_b32 s79, s9, 24 +; SI-NEXT: s_lshr_b32 s80, s9, 16 +; SI-NEXT: s_lshr_b32 s77, s9, 8 +; SI-NEXT: s_lshr_b32 s75, s15, 24 +; SI-NEXT: s_lshr_b32 s71, s15, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 8 +; SI-NEXT: s_lshr_b32 s63, s11, 24 +; SI-NEXT: s_lshr_b32 s70, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s59, s7, 24 +; SI-NEXT: s_lshr_b32 s68, s7, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 8 +; SI-NEXT: s_lshr_b32 s47, s5, 24 +; SI-NEXT: s_lshr_b32 s69, s5, 16 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: v_writelane_b32 v7, s17, 9 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: v_readlane_b32 s19, v21, 1 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v21, 2 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v21, 3 +; SI-NEXT: v_readlane_b32 s18, v7, 0 +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s62, 8 +; SI-NEXT: v_readlane_b32 s19, v7, 1 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 4 +; SI-NEXT: v_readlane_b32 s18, v7, 2 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xff -; SI-NEXT: s_lshl_b32 s17, s60, 8 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s99, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_and_b32 s17, s83, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s47, 24 +; SI-NEXT: s_lshl_b32 s18, s44, 24 +; SI-NEXT: v_readlane_b32 s19, v7, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v7, 4 ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v21, 6 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v21, 7 -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v21, 5 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v21, 8 -; SI-NEXT: v_readlane_b32 s17, v21, 9 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 10 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s74, 8 +; SI-NEXT: v_readlane_b32 s19, v7, 5 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v7, 6 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s87, 8 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_and_b32 s15, s98, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_lshl_b32 s16, s56, 24 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s93, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s82, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s97, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s19, v7, 7 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v21, 12 -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s15, v21, 13 -; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: s_or_b32 s10, s10, s14 -; SI-NEXT: v_readlane_b32 s14, v21, 14 -; SI-NEXT: v_readlane_b32 s15, v21, 15 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v21, 16 -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s16, s72, 0xff +; SI-NEXT: v_readlane_b32 s18, v7, 8 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_or_b32 s12, s12, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s82, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s84, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s86, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s89, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s81, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s91, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s16, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s42, 0xff -; SI-NEXT: s_lshl_b32 s11, s66, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s64, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s54, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s12, s92, 8 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s12, s90, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s78, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s43, 0xff -; SI-NEXT: s_lshl_b32 s11, s71, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s80, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s81, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s77, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s80, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s12, s79, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s12, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s52, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s50, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s48, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s66, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s64, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s12, s88, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s12, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s13, 0xff -; SI-NEXT: s_lshl_b32 s11, s68, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s69, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s70, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s15, 0xff +; SI-NEXT: s_lshl_b32 s9, s73, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s71, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s12, s75, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s12, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s38, 8 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s36, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s34, 24 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s54, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s52, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s50, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s73, 8 +; SI-NEXT: s_and_b32 s8, s11, 0xff +; SI-NEXT: s_lshl_b32 s9, s61, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s75, 0xff +; SI-NEXT: s_and_b32 s9, s70, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s77, 24 +; SI-NEXT: s_lshl_b32 s10, s63, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 @@ -66907,11 +69232,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s30, 8 +; SI-NEXT: s_lshl_b32 s8, s48, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s94, 0xff +; SI-NEXT: s_and_b32 s8, s38, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s92, 24 +; SI-NEXT: s_lshl_b32 s9, s36, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 @@ -66920,11 +69245,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s95, 8 +; SI-NEXT: s_lshl_b32 s7, s57, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s35, 0xff +; SI-NEXT: s_and_b32 s7, s68, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s37, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 @@ -66933,11 +69258,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s90, 8 +; SI-NEXT: s_lshl_b32 s6, s34, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s88, 0xff +; SI-NEXT: s_and_b32 s6, s30, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s78, 24 +; SI-NEXT: s_lshl_b32 s7, s94, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 @@ -66946,11 +69271,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s89, 0xff +; SI-NEXT: s_and_b32 s5, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: s_lshl_b32 s6, s47, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 @@ -66958,126 +69283,109 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s19, v21, 11 -; SI-NEXT: v_readlane_b32 s17, v21, 17 +; SI-NEXT: v_readlane_b32 s19, v7, 9 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 -; SI-NEXT: v_readlane_b32 s98, v20, 34 -; SI-NEXT: v_readlane_b32 s97, v20, 33 -; SI-NEXT: v_readlane_b32 s96, v20, 32 -; SI-NEXT: v_readlane_b32 s87, v20, 31 -; SI-NEXT: v_readlane_b32 s86, v20, 30 -; SI-NEXT: v_readlane_b32 s85, v20, 29 -; SI-NEXT: v_readlane_b32 s84, v20, 28 -; SI-NEXT: v_readlane_b32 s83, v20, 27 -; SI-NEXT: v_readlane_b32 s82, v20, 26 -; SI-NEXT: v_readlane_b32 s81, v20, 25 -; SI-NEXT: v_readlane_b32 s80, v20, 24 -; SI-NEXT: v_readlane_b32 s71, v20, 23 -; SI-NEXT: v_readlane_b32 s70, v20, 22 -; SI-NEXT: v_readlane_b32 s69, v20, 21 -; SI-NEXT: v_readlane_b32 s68, v20, 20 -; SI-NEXT: v_readlane_b32 s67, v20, 19 -; SI-NEXT: v_readlane_b32 s66, v20, 18 -; SI-NEXT: v_readlane_b32 s65, v20, 17 -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: v_readlane_b32 s99, v6, 35 +; SI-NEXT: v_readlane_b32 s98, v6, 34 +; SI-NEXT: v_readlane_b32 s97, v6, 33 +; SI-NEXT: v_readlane_b32 s96, v6, 32 +; SI-NEXT: v_readlane_b32 s87, v6, 31 +; SI-NEXT: v_readlane_b32 s86, v6, 30 +; SI-NEXT: v_readlane_b32 s85, v6, 29 +; SI-NEXT: v_readlane_b32 s84, v6, 28 +; SI-NEXT: v_readlane_b32 s83, v6, 27 +; SI-NEXT: v_readlane_b32 s82, v6, 26 +; SI-NEXT: v_readlane_b32 s81, v6, 25 +; SI-NEXT: v_readlane_b32 s80, v6, 24 +; SI-NEXT: v_readlane_b32 s71, v6, 23 +; SI-NEXT: v_readlane_b32 s70, v6, 22 +; SI-NEXT: v_readlane_b32 s69, v6, 21 +; SI-NEXT: v_readlane_b32 s68, v6, 20 +; SI-NEXT: v_readlane_b32 s67, v6, 19 +; SI-NEXT: v_readlane_b32 s66, v6, 18 +; SI-NEXT: v_readlane_b32 s65, v6, 17 +; SI-NEXT: v_readlane_b32 s64, v6, 16 +; SI-NEXT: v_readlane_b32 s55, v6, 15 +; SI-NEXT: v_readlane_b32 s54, v6, 14 +; SI-NEXT: v_readlane_b32 s53, v6, 13 +; SI-NEXT: v_readlane_b32 s52, v6, 12 +; SI-NEXT: v_readlane_b32 s51, v6, 11 +; SI-NEXT: v_readlane_b32 s50, v6, 10 +; SI-NEXT: v_readlane_b32 s49, v6, 9 +; SI-NEXT: v_readlane_b32 s48, v6, 8 +; SI-NEXT: v_readlane_b32 s39, v6, 7 +; SI-NEXT: v_readlane_b32 s38, v6, 6 +; SI-NEXT: v_readlane_b32 s37, v6, 5 +; SI-NEXT: v_readlane_b32 s36, v6, 4 +; SI-NEXT: v_readlane_b32 s35, v6, 3 +; SI-NEXT: v_readlane_b32 s34, v6, 2 +; SI-NEXT: v_readlane_b32 s31, v6, 1 +; SI-NEXT: v_readlane_b32 s30, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 0 -; SI-NEXT: v_writelane_b32 v21, s5, 1 +; SI-NEXT: v_writelane_b32 v7, s4, 0 +; SI-NEXT: v_writelane_b32 v7, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: v_writelane_b32 v7, s4, 2 +; SI-NEXT: v_writelane_b32 v7, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: v_writelane_b32 v7, s4, 4 +; SI-NEXT: v_writelane_b32 v7, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: v_writelane_b32 v7, s4, 6 +; SI-NEXT: v_writelane_b32 v7, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s5, 17 +; SI-NEXT: v_writelane_b32 v7, s4, 8 +; SI-NEXT: v_writelane_b32 v7, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB97_2 ; @@ -68478,38 +70786,29 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -68517,478 +70816,492 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v25 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v12 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v22 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v21, v21, v26 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v48, v7, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v11, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v46, v9 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v29, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v0, v29 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v37, v26, v11 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_or_b32_e32 v49, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v48, v11, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v36, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v23, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v0, v36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v15, v13, v9 -; SI-NEXT: v_alignbit_b32 v9, v49, v15, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v3, v17, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v35, v11, v15 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v19, v17, v13 -; SI-NEXT: v_or_b32_e32 v50, v9, v19 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v11, v11, v15 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v37, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v0, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v23, v63, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_alignbit_b32 v9, v50, v23, 16 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: v_or_b32_e32 v33, v11, v23 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v13 -; SI-NEXT: v_or_b32_e32 v51, v6, v16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: v_or_b32_e32 v52, v17, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v20, v30, v17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_alignbit_b32 v17, v52, v20, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v54, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v38, v42, v7 +; SI-NEXT: v_alignbit_b32 v53, v5, v19, 16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: v_alignbit_b32 v13, v51, v6, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_or_b32_e32 v53, v21, v10 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v2, v2, v21 -; SI-NEXT: v_or_b32_e32 v12, v12, v24 -; SI-NEXT: v_alignbit_b32 v21, v53, v2, 16 -; SI-NEXT: v_or_b32_e32 v54, v0, v12 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v38 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v8, v8, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v24, v59, v24 -; SI-NEXT: v_alignbit_b32 v25, v54, v0, 16 -; SI-NEXT: v_or_b32_e32 v55, v8, v24 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v25, v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v7, v25, 16 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v20, v57, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v59, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v26, v63, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v13, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v61, v8 -; SI-NEXT: v_alignbit_b32 v29, v55, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v15, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v22, v11, v26, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v2, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: v_or_b32_e32 v32, v11, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v34, v4, v20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v36, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v38, v2, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v61, v17, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v47 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v43, v17, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v48, v17, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v17, v1, v8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_or_b32_e32 v39, v0, v8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_or_b32_e32 v25, v10, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_or_b32_e32 v3, v61, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v59, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v62 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_alignbit_b32 v25, v54, v38, 16 -; SI-NEXT: v_alignbit_b32 v29, v55, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_alignbit_b32 v21, v13, v19, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v53, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_alignbit_b32 v21, v53, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_alignbit_b32 v22, v11, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -68996,31 +71309,31 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v52, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v17, v52, v34, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_alignbit_b32 v32, v9, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -69028,18 +71341,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v51, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v51 +; SI-NEXT: v_alignbit_b32 v45, v7, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -69052,9 +71365,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -69062,19 +71375,19 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v50, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v9, v50, v33, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v53, v5, v61, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -69089,7 +71402,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -69107,11 +71420,11 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v49, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -69126,33 +71439,66 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 -; SI-NEXT: v_alignbit_b32 v0, v48, v37, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v1 +; SI-NEXT: v_alignbit_b32 v1, v23, v0, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v0, v49, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v3, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -69169,25 +71515,26 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v37 -; SI-NEXT: v_mov_b32_e32 v2, v48 -; SI-NEXT: v_mov_b32_e32 v4, v35 -; SI-NEXT: v_mov_b32_e32 v6, v49 -; SI-NEXT: v_mov_b32_e32 v8, v33 -; SI-NEXT: v_mov_b32_e32 v10, v50 -; SI-NEXT: v_mov_b32_e32 v12, v32 -; SI-NEXT: v_mov_b32_e32 v14, v51 -; SI-NEXT: v_mov_b32_e32 v16, v34 -; SI-NEXT: v_mov_b32_e32 v18, v52 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v38 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v28, v39 -; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -70902,632 +73249,783 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v64i8_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v46, v30 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: s_mov_b32 s92, s16 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v40, s96, 32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 +; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s21, 1 +; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s82, v30 +; SI-NEXT: v_readfirstlane_b32 s83, v28 +; SI-NEXT: v_readfirstlane_b32 s44, v27 +; SI-NEXT: v_readfirstlane_b32 s96, v26 +; SI-NEXT: v_readfirstlane_b32 s70, v25 +; SI-NEXT: v_readfirstlane_b32 s68, v24 +; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s65, v22 +; SI-NEXT: v_readfirstlane_b32 s86, v21 +; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s87, v19 +; SI-NEXT: v_readfirstlane_b32 s80, v18 +; SI-NEXT: v_readfirstlane_b32 s36, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v14 +; SI-NEXT: v_readfirstlane_b32 s67, v13 +; SI-NEXT: v_readfirstlane_b32 s34, v12 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s37, v9 +; SI-NEXT: v_readfirstlane_b32 s35, v8 +; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v6 +; SI-NEXT: v_readfirstlane_b32 s51, v5 +; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s54, v2 +; SI-NEXT: v_readfirstlane_b32 s89, v1 +; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s91, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s21, v38 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 +; SI-NEXT: v_readfirstlane_b32 s56, v31 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_readfirstlane_b32 s85, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s46, v48 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 +; SI-NEXT: v_readfirstlane_b32 s99, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s97, v50 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; SI-NEXT: v_readfirstlane_b32 s9, v51 +; SI-NEXT: v_writelane_b32 v41, s58, 3 +; SI-NEXT: v_writelane_b32 v41, s9, 4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s69, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB99_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_or_b32_e32 v37, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_or_b32_e32 v44, v53, v9 -; SI-NEXT: v_or_b32_e32 v33, v1, v44 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_mov_b32_e32 v15, v46 -; SI-NEXT: v_or_b32_e32 v46, v52, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v55, v3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v58, v8 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v26 -; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s12, s6, s5 -; SI-NEXT: s_or_b32 s6, s4, s12 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s13, s5, s6 ; SI-NEXT: s_and_b32 s5, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_lshl_b32 s7, s53, 8 +; SI-NEXT: s_or_b32 s14, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_or_b32 s8, s7, s5 +; SI-NEXT: s_and_b32 s5, s81, 0xff +; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s67, 24 +; SI-NEXT: s_or_b32 s10, s7, s5 +; SI-NEXT: s_and_b32 s5, s80, 0xff +; SI-NEXT: s_lshl_b32 s7, s87, 8 +; SI-NEXT: s_or_b32 s40, s5, s7 +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s86, 24 +; SI-NEXT: s_or_b32 s60, s7, s5 +; SI-NEXT: s_and_b32 s5, s96, 0xff +; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_or_b32 s42, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xff +; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 1 ; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s5, 0xffff -; SI-NEXT: s_and_b32 s5, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s13, s7, s5 -; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_or_b32 s57, s9, s7 ; SI-NEXT: s_and_b32 s7, s28, 0xff ; SI-NEXT: s_lshl_b32 s9, s29, 8 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s7, 0xffff -; SI-NEXT: s_and_b32 s7, s42, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: s_or_b32 s7, s10, s7 -; SI-NEXT: s_or_b32 s15, s9, s7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 -; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_mov_b32_e32 v39, v32 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_lshr_b32 s11, s7, 16 -; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v48, v1, v46 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v59 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v35, v1, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v13, v9 -; SI-NEXT: v_or_b32_e32 v50, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_mov_b32_e32 v13, v4 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v59 -; SI-NEXT: v_mov_b32_e32 v59, v3 -; SI-NEXT: v_mov_b32_e32 v3, v63 -; SI-NEXT: v_mov_b32_e32 v63, v40 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_or_b32_e32 v42, v11, v9 -; SI-NEXT: v_or_b32_e32 v54, v1, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v11, v45, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_mov_b32_e32 v19, v10 -; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v23, v56, v10 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: v_mov_b32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v45, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v24 -; SI-NEXT: v_lshr_b64 v[9:10], v[44:45], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v24, v17 -; SI-NEXT: v_mov_b32_e32 v17, v47 -; SI-NEXT: v_or_b32_e32 v47, v0, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[46:47], 16 -; SI-NEXT: v_mov_b32_e32 v46, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v62 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v31, v7, v15 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v9, v61 -; SI-NEXT: v_mov_b32_e32 v61, v7 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v5, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v56 -; SI-NEXT: v_or_b32_e32 v56, v0, v31 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_lshr_b64 v[21:22], v[55:56], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v22, v30, v15 -; SI-NEXT: v_or_b32_e32 v44, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v38, v29, v15 -; SI-NEXT: v_lshr_b64 v[25:26], v[43:44], 16 -; SI-NEXT: v_or_b32_e32 v43, v0, v38 -; SI-NEXT: v_mov_b32_e32 v0, v30 -; SI-NEXT: v_lshr_b64 v[29:30], v[42:43], 16 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v40, v63 -; SI-NEXT: v_mov_b32_e32 v63, v3 -; SI-NEXT: v_mov_b32_e32 v3, v59 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v10, v19 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 -; SI-NEXT: v_mov_b32_e32 v13, v27 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_mov_b32_e32 v14, v6 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, v58 -; SI-NEXT: v_mov_b32_e32 v22, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v0 -; SI-NEXT: v_mov_b32_e32 v26, v34 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v47, v17 -; SI-NEXT: v_mov_b32_e32 v17, v24 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v56 -; SI-NEXT: v_mov_b32_e32 v56, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v5 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v61 -; SI-NEXT: v_mov_b32_e32 v61, v9 -; SI-NEXT: s_cbranch_execnz .LBB99_3 -; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s89, 24 +; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_and_b32 s9, s94, 0xff +; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_writelane_b32 v41, s44, 11 +; SI-NEXT: s_lshl_b32 s44, s37, 24 +; SI-NEXT: s_or_b32 vcc_lo, s44, s11 +; SI-NEXT: s_and_b32 s11, s38, 0xff +; SI-NEXT: s_lshl_b32 s44, s64, 8 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: s_or_b32 vcc_hi, s45, s44 +; SI-NEXT: s_and_b32 s44, s65, 0xff +; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_mov_b32 s21, s46 +; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s97, 12 +; SI-NEXT: s_mov_b32 s97, s86 +; SI-NEXT: s_mov_b32 s86, s84 +; SI-NEXT: s_mov_b32 s84, s70 +; SI-NEXT: s_mov_b32 s70, s34 +; SI-NEXT: s_mov_b32 s34, s88 +; SI-NEXT: s_mov_b32 s88, s24 +; SI-NEXT: s_or_b32 s24, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: s_and_b32 s44, s82, 0xff +; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s69, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s78, 24 +; SI-NEXT: s_mov_b32 s95, s90 +; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_or_b32 s18, s46, s45 +; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s45 +; SI-NEXT: s_or_b32 s63, s44, s18 +; SI-NEXT: s_and_b32 s44, s98, 0xff +; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s85, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s56, 24 +; SI-NEXT: s_mov_b32 s76, s56 +; SI-NEXT: s_mov_b32 s56, s85 +; SI-NEXT: s_mov_b32 s85, s79 +; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_or_b32 s19, s46, s45 +; SI-NEXT: s_and_b32 s45, s99, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s72, s46, s45 +; SI-NEXT: s_or_b32 s73, s44, s19 +; SI-NEXT: s_and_b32 s44, s52, 0xff +; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s91, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_mov_b32 s47, s96 +; SI-NEXT: s_mov_b32 s96, s78 +; SI-NEXT: s_mov_b32 s78, s69 +; SI-NEXT: s_mov_b32 s69, s68 +; SI-NEXT: s_mov_b32 s68, s38 +; SI-NEXT: s_mov_b32 s38, s35 +; SI-NEXT: s_mov_b32 s35, s89 +; SI-NEXT: s_or_b32 s89, s46, s45 +; SI-NEXT: s_and_b32 s45, s50, 0xff +; SI-NEXT: s_or_b32 s5, s5, s57 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_mov_b32 s45, s83 +; SI-NEXT: s_mov_b32 s83, s91 +; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s28, s42, 0xffff +; SI-NEXT: s_mov_b32 s59, s94 +; SI-NEXT: s_mov_b32 s94, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_or_b32 s42, s12, s4 +; SI-NEXT: s_mov_b32 s43, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s9, s9, vcc_lo +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: s_or_b32 s11, s11, vcc_hi +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 +; SI-NEXT: s_or_b32 s7, s7, s77 +; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s58, s15, 0xffff +; SI-NEXT: s_mov_b32 s44, s82 +; SI-NEXT: s_mov_b32 s82, s81 +; SI-NEXT: s_mov_b32 s81, s55 +; SI-NEXT: s_mov_b32 s55, s54 +; SI-NEXT: s_mov_b32 s54, s51 +; SI-NEXT: s_mov_b32 s51, s37 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_mov_b32 s46, s98 +; SI-NEXT: s_mov_b32 s98, s93 +; SI-NEXT: s_and_b32 s93, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_or_b32 s40, s13, s6 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_or_b32 s14, s14, s8 +; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_or_b32 s12, s58, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_or_b32 s10, s16, s60 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_or_b32 s6, s28, s72 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_or_b32 s4, s27, s74 +; SI-NEXT: s_mov_b32 s5, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s16, s37 +; SI-NEXT: s_mov_b32 s37, s51 +; SI-NEXT: s_mov_b32 s51, s54 +; SI-NEXT: s_mov_b32 s54, s55 +; SI-NEXT: s_mov_b32 s55, s81 +; SI-NEXT: s_mov_b32 s81, s82 +; SI-NEXT: s_mov_b32 s82, s44 +; SI-NEXT: v_readlane_b32 s44, v41, 11 +; SI-NEXT: s_mov_b32 s93, s98 +; SI-NEXT: s_mov_b32 s98, s46 +; SI-NEXT: s_mov_b32 s46, s21 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: s_mov_b32 s28, s91 +; SI-NEXT: s_mov_b32 s91, s83 +; SI-NEXT: s_mov_b32 s83, s45 +; SI-NEXT: s_mov_b32 s27, s94 +; SI-NEXT: s_mov_b32 s94, s59 +; SI-NEXT: s_lshr_b32 s23, s57, 16 +; SI-NEXT: s_lshr_b32 s57, s77, 16 +; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s63, s24, 16 +; SI-NEXT: s_mov_b32 s24, s88 +; SI-NEXT: s_mov_b32 s88, s34 +; SI-NEXT: s_mov_b32 s34, s70 +; SI-NEXT: s_mov_b32 s70, s84 +; SI-NEXT: s_mov_b32 s84, s86 +; SI-NEXT: s_mov_b32 s86, s97 +; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_mov_b32 s18, s90 +; SI-NEXT: s_mov_b32 s90, s95 +; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_mov_b32 s19, s79 +; SI-NEXT: s_mov_b32 s79, s85 +; SI-NEXT: s_mov_b32 s85, s56 +; SI-NEXT: s_mov_b32 s56, s76 +; SI-NEXT: s_lshr_b32 s45, s89, 16 +; SI-NEXT: s_mov_b32 s89, s35 +; SI-NEXT: s_mov_b32 s35, s38 +; SI-NEXT: s_mov_b32 s38, s68 +; SI-NEXT: s_mov_b32 s68, s69 +; SI-NEXT: s_mov_b32 s69, s78 +; SI-NEXT: s_mov_b32 s78, s96 +; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b64 s[76:77], 0 +; SI-NEXT: s_branch .LBB99_3 +; SI-NEXT: .LBB99_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 s[76:77], -1 +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: .LBB99_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] +; SI-NEXT: v_readlane_b32 s76, v41, 5 +; SI-NEXT: v_readlane_b32 s77, v41, 6 +; SI-NEXT: s_mov_b32 s58, s76 +; SI-NEXT: v_readlane_b32 s76, v41, 7 +; SI-NEXT: v_readlane_b32 s77, v41, 8 +; SI-NEXT: s_cbranch_vccnz .LBB99_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_add_i32 s50, s50, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s5, s55, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s39, s52, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s28, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: s_lshl_b32 s6, s93, 8 +; SI-NEXT: s_add_i32 s79, s16, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_and_b32 s7, s79, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s91, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 4 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: s_and_b32 s6, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_and_b32 s8, s99, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s46, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s98, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s85, s85, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s9, s22, 0xff +; SI-NEXT: s_and_b32 s9, s85, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s56, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s96, s96, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s96, 0xff +; SI-NEXT: s_lshl_b32 s9, s44, 8 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v41, 2 +; SI-NEXT: s_and_b32 s10, s83, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s82, 0xff +; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_add_i32 s69, s69, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s80, 0xff +; SI-NEXT: s_lshl_b32 s11, s87, 8 +; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s66, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s86, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s65, 0xff +; SI-NEXT: s_lshl_b32 s12, s84, 8 +; SI-NEXT: s_add_i32 s52, s68, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s52, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s55, 0xff +; SI-NEXT: s_lshl_b32 s13, s71, 8 +; SI-NEXT: s_add_i32 s48, s34, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s48, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s67, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s38, 0xff +; SI-NEXT: s_lshl_b32 s14, s64, 8 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s15, s31, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s14, s36, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s36, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_add_i32 s95, s88, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s21, s95, 0xff +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s94, 0xff +; SI-NEXT: s_lshl_b32 s21, s49, 8 +; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_lshl_b32 s21, s37, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s21, s24, 0xff +; SI-NEXT: s_lshl_b32 s16, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s16, s29, 8 +; SI-NEXT: s_lshl_b32 s23, s89, 24 +; SI-NEXT: s_add_i32 s89, s90, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s16, s89, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s23, s16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_add_i32 s41, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s19, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s20, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 1 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s22, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s11, s5, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v54, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v55, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v51, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v62 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_lshr_b64 v[25:26], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v49, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[37:38], 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[33:34], 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[35:36], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mov_b32_e32 v10, v38 -; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v14, v34 -; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 9 +; SI-NEXT: s_lshr_b32 s23, s43, 16 +; SI-NEXT: s_lshr_b32 s57, s41, 16 +; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s5, 16 +; SI-NEXT: v_writelane_b32 v41, s17, 10 +; SI-NEXT: .LBB99_5: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s43, 0xffff +; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s40, 0xffff +; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xffff +; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_or_b32 s14, s14, s20 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_or_b32 s15, s15, s20 +; SI-NEXT: v_readlane_b32 s20, v41, 9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s20, s61, 16 +; SI-NEXT: s_or_b32 s13, s13, s20 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s20, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s20 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s20, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s20 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s20, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s20 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s20, s73, 16 +; SI-NEXT: s_or_b32 s9, s9, s20 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s20 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s20, s75, 16 +; SI-NEXT: s_or_b32 s7, s7, s20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_readlane_b32 s21, v41, 10 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v49 -; SI-NEXT: v_mov_b32_e32 v20, v35 -; SI-NEXT: v_mov_b32_e32 v22, v36 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v26, v51 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB99_4: -; SI-NEXT: v_mov_b32_e32 v39, v32 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v64i8_to_v32i16_scalar: ; VI: ; %bb.0: @@ -72911,133 +75409,179 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v12 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 @@ -73073,153 +75617,216 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -73365,6 +75972,26 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -73381,211 +76008,239 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v7 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v63 ; SI-NEXT: s_cbranch_execnz .LBB101_3 ; SI-NEXT: .LBB101_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 ; SI-NEXT: .LBB101_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -73603,41 +76258,105 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v38 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB101_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v32f16_to_v32bf16_scalar: @@ -73832,159 +76551,193 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v62 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -74027,147 +76780,211 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -75301,7 +78118,39 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -75319,242 +78168,295 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v13 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v15 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v15 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v12 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s6 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v40 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -75571,41 +78473,49 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: @@ -76976,524 +79886,567 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_or_b32_e32 v54, v33, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; SI-NEXT: v_or_b32_e32 v50, v32, v7 -; SI-NEXT: v_alignbit_b32 v7, v50, v54, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v50, v54, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_or_b32_e32 v54, v25, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_or_b32_e32 v50, v23, v9 +; SI-NEXT: v_alignbit_b32 v9, v50, v54, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v9, v50, v54, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_or_b32_e32 v21, v36, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v20, v35, v7 -; SI-NEXT: v_alignbit_b32 v7, v20, v21, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 +; SI-NEXT: v_or_b32_e32 v22, v28, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_or_b32_e32 v21, v27, v9 +; SI-NEXT: v_alignbit_b32 v9, v21, v22, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v20, v21, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v21, v22, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v20, v21, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v21, v22, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_or_b32_e32 v18, v39, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_or_b32_e32 v19, v38, v7 -; SI-NEXT: v_alignbit_b32 v7, v19, v18, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_or_b32_e32 v19, v31, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_or_b32_e32 v20, v30, v9 +; SI-NEXT: v_alignbit_b32 v9, v20, v19, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v19, v18, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v20, v19, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v19, v18, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v20, v19, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 -; SI-NEXT: v_or_b32_e32 v16, v51, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_or_b32_e32 v17, v49, v7 -; SI-NEXT: v_alignbit_b32 v7, v17, v16, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v34, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_or_b32_e32 v18, v33, v9 +; SI-NEXT: v_alignbit_b32 v9, v18, v17, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v17, v16, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v18, v17, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 -; SI-NEXT: v_or_b32_e32 v15, v55, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_or_b32_e32 v14, v53, v7 -; SI-NEXT: v_alignbit_b32 v7, v14, v15, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v37, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_or_b32_e32 v15, v36, v9 +; SI-NEXT: v_alignbit_b32 v9, v15, v16, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v14, v15, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v14, v15, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v15, v16, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v42 -; SI-NEXT: v_or_b32_e32 v12, v41, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_or_b32_e32 v13, v22, v7 -; SI-NEXT: v_alignbit_b32 v7, v13, v12, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v48, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v39, v9 +; SI-NEXT: v_alignbit_b32 v9, v14, v13, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v14, v13, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v13, v12, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v14, v13, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v26, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_or_b32_e32 v11, v25, v7 -; SI-NEXT: v_alignbit_b32 v7, v11, v10, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 +; SI-NEXT: v_or_b32_e32 v11, v52, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_or_b32_e32 v12, v51, v9 +; SI-NEXT: v_alignbit_b32 v9, v12, v11, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v11, v10, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v12, v11, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v11, v10, 8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v12, v11, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_or_b32_e32 v9, v29, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_or_b32_e32 v10, v40, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 ; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 -; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 +; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v9, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v15, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v16, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v21, v21, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v54, v22, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_or_b32_e32 v50, v24, v22 -; SI-NEXT: v_alignbit_b32 v22, v50, v54, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v22, v22, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v50, v54, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v54, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_or_b32_e32 v50, v23, v24 +; SI-NEXT: v_alignbit_b32 v23, v50, v54, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v20, v21, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v50, v54, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v20, v21, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v21, v22, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v20, v21, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v21, v22, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v18, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v21, v22, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v18, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v20, v19, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v18, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v20, v19, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v17, v16, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v17, v16, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v14, v15, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v15, v16, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v14, v15, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v15, v16, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v14, v15, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v15, v16, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v13, v12, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v14, v13, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v13, v12, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v14, v13, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v13, v12, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v14, v13, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 ; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 ; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 ; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v43 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v43 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -77507,127 +80460,126 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v63 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 ; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v60 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v8, v21, v8 -; SI-NEXT: v_or_b32_e32 v8, v20, v8 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v58 -; SI-NEXT: v_or_b32_e32 v8, v8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v61 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v6, v18, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v61 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v59 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v59 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v57 @@ -77636,14 +80588,14 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -77655,7 +80607,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v47 @@ -77664,14 +80616,14 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -77682,9 +80634,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -77694,14 +80646,14 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -77712,36 +80664,36 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -78995,113 +81947,161 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v40, s30, 0 ; SI-NEXT: v_writelane_b32 v40, s31, 1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 ; SI-NEXT: v_writelane_b32 v40, s34, 2 ; SI-NEXT: v_writelane_b32 v40, s35, 3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s36, 4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_writelane_b32 v40, s37, 5 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_readfirstlane_b32 s5, v11 ; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v9 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: v_readfirstlane_b32 s5, v10 ; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v15 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_readfirstlane_b32 s5, v14 ; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v17 ; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_readfirstlane_b32 s5, v16 ; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v21 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: v_readfirstlane_b32 s5, v19 ; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v24 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v24 +; SI-NEXT: v_readfirstlane_b32 s5, v23 ; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v22 ; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v27 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v27 +; SI-NEXT: v_readfirstlane_b32 s5, v26 ; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v26 +; SI-NEXT: v_readfirstlane_b32 s5, v25 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v31 +; SI-NEXT: v_readfirstlane_b32 s5, v32 ; SI-NEXT: s_or_b32 s6, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: v_readfirstlane_b32 s5, v30 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v38 ; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s21, v35 +; SI-NEXT: v_readfirstlane_b32 s21, v36 ; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 @@ -79134,175 +82134,175 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s25, s9, 8 ; SI-NEXT: s_lshr_b32 s23, s7, 8 ; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_readfirstlane_b32 s5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v28 -; SI-NEXT: v_readfirstlane_b32 s7, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_readfirstlane_b32 s6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readfirstlane_b32 s8, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 ; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readfirstlane_b32 s8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v25 -; SI-NEXT: v_readfirstlane_b32 s9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_readfirstlane_b32 s10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readfirstlane_b32 s8, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_readfirstlane_b32 s10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_readfirstlane_b32 s13, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 +; SI-NEXT: v_readfirstlane_b32 s12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readfirstlane_b32 s13, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readfirstlane_b32 s14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: v_readfirstlane_b32 s16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readfirstlane_b32 s16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_readfirstlane_b32 s16, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v10 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_readfirstlane_b32 s18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v12 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: v_readfirstlane_b32 s19, v9 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: v_readfirstlane_b32 s20, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 @@ -79336,14 +82336,14 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s25, s9, 8 ; SI-NEXT: s_lshr_b32 s23, s7, 8 ; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: .LBB105_3: ; %end ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s26, s26, 8 @@ -79354,18 +82354,18 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff ; SI-NEXT: s_lshl_b32 s19, s45, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v48 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v7, s18, v7 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v9, s18, v9 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s42, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 @@ -79374,22 +82374,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s19, s24, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 ; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v10, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff ; SI-NEXT: s_lshl_b32 s17, s43, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v37 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v6, s16, v6 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v8, s16, v8 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s16, s56, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 @@ -79398,22 +82398,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s17, s40, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v9, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff ; SI-NEXT: s_lshl_b32 s15, s41, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v35 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s14, v5 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v7, s14, v7 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s14, s62, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 @@ -79422,23 +82422,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s15, s46, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff ; SI-NEXT: s_lshl_b32 s13, s29, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v33 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v6, s12, v6 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s76, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 @@ -79447,22 +82446,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s13, s60, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v7, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff ; SI-NEXT: s_lshl_b32 s11, s27, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, s10, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s10, v5 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s10, s90, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 @@ -79471,22 +82470,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s11, s74, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v6, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s25, 8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v3, s8, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s8, v4 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s8, s94, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 @@ -79495,21 +82494,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s9, s88, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s23, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s36, 8 @@ -79519,19 +82518,19 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s30, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -79560,40 +82559,40 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr30 @@ -81021,190 +84020,135 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v29 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v22 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v26 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v35 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v44 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -81230,360 +84174,417 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v60 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_or_b32_e32 v17, v17, v30 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v62 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: v_or_b32_e32 v9, v9, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v11, v11, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v12, v9, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v9, v35, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v39, v7 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v30, v7 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v1, v51, v1 -; SI-NEXT: v_or_b32_e32 v5, v40, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v10, v60, v10 +; SI-NEXT: v_or_b32_e32 v11, v57, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v14, v44, v14 +; SI-NEXT: v_or_b32_e32 v15, v41, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v61, v7 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v58, v7 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v57, v7 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v23, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v38, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v16, v40, v16 +; SI-NEXT: v_or_b32_e32 v17, v52, v17 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v18, v27, v18 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_or_b32_e32 v23, v39, v23 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v49, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 @@ -81591,19 +84592,85 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v53 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -81620,26 +84687,26 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v8, v33 -; SI-NEXT: v_mov_b32_e32 v10, v37 -; SI-NEXT: v_mov_b32_e32 v12, v49 -; SI-NEXT: v_mov_b32_e32 v14, v53 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v18, v34 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v22, v38 -; SI-NEXT: v_mov_b32_e32 v24, v48 -; SI-NEXT: v_mov_b32_e32 v26, v50 -; SI-NEXT: v_mov_b32_e32 v28, v52 -; SI-NEXT: v_mov_b32_e32 v30, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -83475,11 +86542,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83491,11 +86558,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s77, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s88, 0xff ; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s91, 0xff ; SI-NEXT: s_lshl_b32 s5, s89, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83507,11 +86574,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s35, 0xff ; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_and_b32 s4, s38, 0xff ; SI-NEXT: s_lshl_b32 s5, s36, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_and_b32 s4, s39, 0xff ; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83523,11 +86590,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s31, 0xff ; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_and_b32 s4, s37, 0xff ; SI-NEXT: s_lshl_b32 s5, s34, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_and_b32 s4, s46, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83539,11 +86606,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s58, 0xff ; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff ; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_and_b32 s4, s63, 0xff ; SI-NEXT: s_lshl_b32 s5, s61, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83555,11 +86622,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s75, 0xff ; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_and_b32 s4, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83571,19 +86638,19 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s41, 0xff ; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 ; SI-NEXT: s_and_b32 s4, s11, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -83753,36 +86820,100 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s59 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 ; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_readlane_b32 s39, v32, 7 ; SI-NEXT: v_readlane_b32 s38, v32, 6 ; SI-NEXT: v_readlane_b32 s37, v32, 5 @@ -83800,34 +86931,34 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB107_2 ; @@ -85213,10 +88344,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -85276,58 +88406,94 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v16 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -85357,392 +88523,388 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 -; SI-NEXT: v_alignbit_b32 v48, v1, v38, 16 -; SI-NEXT: v_alignbit_b32 v50, v37, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_alignbit_b32 v48, v1, v29, 16 +; SI-NEXT: v_alignbit_b32 v50, v37, v27, 16 ; SI-NEXT: v_alignbit_b32 v1, v50, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_alignbit_b32 v23, v1, v33, 16 +; SI-NEXT: v_alignbit_b32 v21, v19, v31, 16 ; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 -; SI-NEXT: v_alignbit_b32 v17, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16 +; SI-NEXT: v_alignbit_b32 v18, v16, v34, 16 ; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_alignbit_b32 v14, v1, v42, 16 -; SI-NEXT: v_alignbit_b32 v15, v13, v40, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_alignbit_b32 v14, v1, v51, 16 +; SI-NEXT: v_alignbit_b32 v15, v13, v39, 16 ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 -; SI-NEXT: v_alignbit_b32 v11, v1, v45, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_alignbit_b32 v11, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v12, v10, v53, 16 ; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_alignbit_b32 v8, v1, v43, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v41, 16 ; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16 -; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_alignbit_b32 v5, v1, v46, 16 +; SI-NEXT: v_alignbit_b32 v6, v4, v44, 16 ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 ; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_alignbit_b32 v3, v1, v47, 16 ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v18 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v26 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v15 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v35 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v40 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v24 ; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v52 ; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: .LBB108_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_alignbit_b32 v48, v30, v20, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v48, v28, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v50, v37, v20, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_alignbit_b32 v20, v50, v48, 24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v49 ; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v50, v48, 16 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v50, v48, 8 -; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v54 ; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v53 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 ; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v10 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v15, v14, 24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v24 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 +; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 ; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v52 ; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v4 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 ; SI-NEXT: v_alignbit_b32 v6, v4, v6, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v9, v8, 8 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v6, v5, 8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v26 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v35 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v40 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 ; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 ; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v52 ; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 @@ -85766,11 +88928,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 @@ -85791,14 +88953,14 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v20, v22, v20 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 @@ -85820,12 +88982,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 @@ -85838,7 +89000,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 @@ -85849,12 +89011,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 @@ -85867,11 +89029,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 @@ -85880,12 +89042,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 @@ -85900,7 +89062,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -85909,12 +89071,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 @@ -85927,11 +89089,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -85940,12 +89102,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 @@ -85958,7 +89120,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -85969,12 +89131,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -85987,34 +89149,34 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -88351,48 +91513,80 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_writelane_b32 v40, s87, 31 ; SI-NEXT: v_writelane_b32 v40, s96, 32 ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_and_b32 s10, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s26, 16 +; SI-NEXT: s_and_b32 s14, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s26, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s40, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s7 ; SI-NEXT: v_writelane_b32 v40, s99, 35 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v19 -; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: s_lshr_b32 s73, s4, 16 @@ -88403,55 +91597,55 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 ; SI-NEXT: v_writelane_b32 v41, s5, 1 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v9 ; SI-NEXT: s_lshr_b32 s45, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v15 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_readfirstlane_b32 s4, v13 ; SI-NEXT: s_lshr_b32 s25, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v20 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v22 +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: s_lshr_b32 s41, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v24 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: v_readfirstlane_b32 s4, v25 ; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v21 ; SI-NEXT: s_lshr_b32 s19, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v30 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v27 ; SI-NEXT: s_lshr_b32 s11, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: v_readfirstlane_b32 s58, v5 -; SI-NEXT: v_readfirstlane_b32 s44, v9 -; SI-NEXT: v_readfirstlane_b32 s24, v21 -; SI-NEXT: v_readfirstlane_b32 s40, v23 -; SI-NEXT: v_readfirstlane_b32 s18, v27 -; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s58, v6 +; SI-NEXT: v_readfirstlane_b32 s44, v10 +; SI-NEXT: v_readfirstlane_b32 s24, v16 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v22 +; SI-NEXT: v_readfirstlane_b32 s10, v29 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 ; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 @@ -88483,19 +91677,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 ; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v5 ; SI-NEXT: s_lshr_b32 s24, s76, 8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v9 ; SI-NEXT: s_lshr_b32 s23, s62, 8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v13 ; SI-NEXT: s_lshr_b32 s18, s56, 8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 ; SI-NEXT: s_lshr_b32 s17, s42, 8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v21 ; SI-NEXT: s_lshr_b32 s15, s22, 8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v27 ; SI-NEXT: s_lshr_b32 s10, s28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v31 ; SI-NEXT: s_lshr_b32 s9, s14, 8 ; SI-NEXT: s_lshr_b32 s4, s8, 8 ; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 @@ -88507,127 +91701,127 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_readfirstlane_b32 s4, v26 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_readfirstlane_b32 s10, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v16 -; SI-NEXT: v_readfirstlane_b32 s10, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_lshr_b32 s11, s4, 16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 -; SI-NEXT: v_readfirstlane_b32 s10, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 ; SI-NEXT: s_lshr_b32 s11, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 -; SI-NEXT: v_readfirstlane_b32 s18, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: s_lshr_b32 s17, s4, 16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v25 ; SI-NEXT: s_lshr_b32 s19, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 ; SI-NEXT: s_lshr_b32 s17, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 -; SI-NEXT: v_readfirstlane_b32 s24, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: s_lshr_b32 s25, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 ; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: s_lshr_b32 s45, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s44, v11 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 ; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s44, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s58, v7 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_readfirstlane_b32 s58, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 -; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s58, v8 ; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_readfirstlane_b32 s72, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 ; SI-NEXT: s_lshr_b32 s73, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s72, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: v_readfirstlane_b32 s10, v28 +; SI-NEXT: v_readfirstlane_b32 s18, v22 +; SI-NEXT: v_readfirstlane_b32 s16, v20 ; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_readfirstlane_b32 s24, v13 -; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_readfirstlane_b32 s44, v10 +; SI-NEXT: v_readfirstlane_b32 s58, v6 ; SI-NEXT: s_mov_b32 s75, s76 ; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 @@ -88671,13 +91865,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b32 s9, s14, 8 ; SI-NEXT: s_lshr_b32 s4, s8, 8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v13 ; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 ; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 @@ -88828,7 +92022,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_and_b32 s10, s19, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v28 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s10, v1 ; SI-NEXT: v_or_b32_e32 v1, s7, v1 @@ -88853,7 +92047,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_and_b32 s9, s11, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s9, v1 ; SI-NEXT: v_or_b32_e32 v1, s7, v1 @@ -88878,7 +92072,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v23 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -88971,11 +92165,11 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr78 @@ -91377,312 +94571,302 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v29 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v20 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v28 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v3, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v21, v14, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_or_b32_e32 v33, v7, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_or_b32_e32 v23, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v63 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_or_b32_e32 v27, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v38 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v50, v13, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v51, v13, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_or_b32_e32 v52, v13, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v13, v11 +; SI-NEXT: v_or_b32_e32 v53, v36, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v11, v2, v10 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v51 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v32, v10, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v41, v48, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v13, v6, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v44, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v45, v49, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v15, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v57, v54, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; SI-NEXT: v_or_b32_e32 v36, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v58, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v37, v22, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v61, v40, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v28 -; SI-NEXT: v_or_b32_e32 v48, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v31, v47, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v32, v13, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v3, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v33, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v21, v18, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v30, v26, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v23, v8, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v24, v16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v59 -; SI-NEXT: v_or_b32_e32 v52, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v63 +; SI-NEXT: v_or_b32_e32 v28, v3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v53, v12, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v46, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v30 -; SI-NEXT: v_or_b32_e32 v40, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v29, v58, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v15, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v31, v61, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_or_b32_e32 v19, v8, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -91722,329 +94906,383 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v18 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v58, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v46, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v63 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v56 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v60, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_or_b32_e32 v3, v63, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v51, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v36, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v43 -; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 -; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v61 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -92061,22 +95299,32 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v4, v33 -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: v_mov_b32_e32 v8, v51 -; SI-NEXT: v_mov_b32_e32 v9, v55 -; SI-NEXT: v_mov_b32_e32 v12, v32 -; SI-NEXT: v_mov_b32_e32 v14, v34 -; SI-NEXT: v_mov_b32_e32 v16, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v18, v38 -; SI-NEXT: v_mov_b32_e32 v20, v48 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -93791,486 +97039,685 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: s_mov_b32 s6, s16 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s18, 0 +; SI-NEXT: v_writelane_b32 v41, s19, 1 +; SI-NEXT: v_writelane_b32 v41, s6, 2 +; SI-NEXT: v_writelane_b32 v41, s17, 3 +; SI-NEXT: v_writelane_b32 v41, s21, 4 +; SI-NEXT: v_writelane_b32 v41, s22, 5 +; SI-NEXT: v_writelane_b32 v41, s20, 6 +; SI-NEXT: v_writelane_b32 v41, s25, 7 +; SI-NEXT: v_writelane_b32 v41, s29, 8 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s24, 9 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s95, v30 +; SI-NEXT: v_readfirstlane_b32 s88, v29 +; SI-NEXT: v_readfirstlane_b32 s79, v28 +; SI-NEXT: v_readfirstlane_b32 s89, v27 +; SI-NEXT: v_readfirstlane_b32 s90, v26 +; SI-NEXT: v_readfirstlane_b32 s76, v25 +; SI-NEXT: v_readfirstlane_b32 s75, v24 +; SI-NEXT: v_readfirstlane_b32 s77, v23 +; SI-NEXT: v_readfirstlane_b32 s78, v22 +; SI-NEXT: v_readfirstlane_b32 s72, v21 +; SI-NEXT: v_readfirstlane_b32 s63, v20 +; SI-NEXT: v_readfirstlane_b32 s73, v19 +; SI-NEXT: v_readfirstlane_b32 s74, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v17 +; SI-NEXT: v_readfirstlane_b32 s57, v16 +; SI-NEXT: v_readfirstlane_b32 s61, v15 +; SI-NEXT: v_readfirstlane_b32 s62, v14 +; SI-NEXT: v_readfirstlane_b32 s46, v13 +; SI-NEXT: v_readfirstlane_b32 s45, v12 +; SI-NEXT: v_readfirstlane_b32 s47, v11 +; SI-NEXT: v_readfirstlane_b32 s56, v10 +; SI-NEXT: v_readfirstlane_b32 s40, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s43, v7 +; SI-NEXT: v_readfirstlane_b32 s44, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s9, v33 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: v_readfirstlane_b32 s46, v30 -; SI-NEXT: v_readfirstlane_b32 s44, v23 -; SI-NEXT: v_readfirstlane_b32 s45, v22 -; SI-NEXT: v_readfirstlane_b32 s41, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s59, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s42, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s60, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: v_readfirstlane_b32 s41, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s55, v38 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s58, v31 -; SI-NEXT: v_readfirstlane_b32 s59, v32 -; SI-NEXT: v_readfirstlane_b32 s56, v33 -; SI-NEXT: v_readfirstlane_b32 s57, v34 -; SI-NEXT: v_readfirstlane_b32 s47, v35 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v37 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_writelane_b32 v41, s28, 10 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_writelane_b32 v41, s7, 11 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s39, v32 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v38 +; SI-NEXT: v_readfirstlane_b32 s50, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s52, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s31, v48 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s30, v49 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v48 +; SI-NEXT: v_readfirstlane_b32 s35, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s38, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s92, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 -; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: v_readfirstlane_b32 s91, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s93, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_and_b32 s4, s6, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s17, 24 -; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: s_or_b32 s94, s5, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_or_b32 s34, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s11, s4, 16 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s25, 24 -; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s40, s4, 16 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s7, 24 -; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s60, s4, 16 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 -; SI-NEXT: s_lshl_b32 s61, s4, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v19 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v51 -; SI-NEXT: v_or_b32_e32 v37, v13, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v39, v21, v17 -; SI-NEXT: s_lshl_b32 s62, s4, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v24 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: v_or_b32_e32 v32, v29, v25 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v38, v1, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v33, v14, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v26 -; SI-NEXT: s_lshl_b32 s63, s4, 16 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s56, 8 -; SI-NEXT: v_or_b32_e32 v34, v42, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v54 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v48, v15, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v36, v23, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v49 -; SI-NEXT: s_lshl_b32 s72, s4, 16 -; SI-NEXT: v_or_b32_e32 v35, v31, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v53 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v9, v0, v9 -; SI-NEXT: v_or_b32_e32 v13, v5, v13 -; SI-NEXT: v_or_b32_e32 v15, v6, v15 -; SI-NEXT: v_or_b32_e32 v17, v7, v17 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_or_b32_e32 v23, v30, v23 -; SI-NEXT: v_or_b32_e32 v25, v41, v25 -; SI-NEXT: v_or_b32_e32 v29, v44, v29 -; SI-NEXT: s_lshl_b32 s73, s4, 16 -; SI-NEXT: v_or_b32_e32 v31, v45, v31 -; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_or_b32 s37, s6, s5 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s25, 24 +; SI-NEXT: s_or_b32 s36, s6, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s49, s6, s5 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: s_or_b32 s53, s7, s6 +; SI-NEXT: s_and_b32 s6, s14, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s13, 24 +; SI-NEXT: s_or_b32 s51, s7, s6 +; SI-NEXT: s_and_b32 s6, s10, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s11, 24 +; SI-NEXT: s_or_b32 s54, s7, s6 +; SI-NEXT: s_and_b32 s6, s44, 0xff +; SI-NEXT: s_lshl_b32 s7, s43, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s40, 24 +; SI-NEXT: s_or_b32 s65, s17, s7 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s47, 24 +; SI-NEXT: s_or_b32 s64, s17, s7 +; SI-NEXT: s_and_b32 s7, s45, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s46, 24 +; SI-NEXT: s_or_b32 s66, s17, s7 +; SI-NEXT: s_and_b32 s7, s62, 0xff +; SI-NEXT: s_lshl_b32 s17, s61, 8 +; SI-NEXT: s_or_b32 vcc_lo, s7, s17 +; SI-NEXT: s_and_b32 s7, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s58, 24 +; SI-NEXT: s_or_b32 s68, s17, s7 +; SI-NEXT: s_and_b32 s7, s74, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s73, 24 +; SI-NEXT: s_or_b32 s67, s17, s7 +; SI-NEXT: s_and_b32 s7, s63, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s72, 24 +; SI-NEXT: s_or_b32 s69, s17, s7 +; SI-NEXT: s_and_b32 s7, s78, 0xff +; SI-NEXT: s_lshl_b32 s17, s77, 8 +; SI-NEXT: s_or_b32 vcc_hi, s7, s17 +; SI-NEXT: s_and_b32 s7, s75, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s76, 24 +; SI-NEXT: s_or_b32 s71, s17, s7 +; SI-NEXT: s_and_b32 s7, s90, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s89, 24 +; SI-NEXT: s_or_b32 s70, s17, s7 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s88, 24 +; SI-NEXT: s_or_b32 s80, s17, s7 +; SI-NEXT: s_and_b32 s7, s95, 0xff +; SI-NEXT: s_lshl_b32 s17, s93, 8 +; SI-NEXT: s_or_b32 s96, s7, s17 +; SI-NEXT: s_and_b32 s7, s91, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_or_b32 s82, s17, s7 +; SI-NEXT: s_and_b32 s7, s38, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s35, 24 +; SI-NEXT: s_or_b32 s81, s17, s7 +; SI-NEXT: s_and_b32 s7, s30, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s31, 24 +; SI-NEXT: s_or_b32 s83, s17, s7 +; SI-NEXT: s_and_b32 s7, s52, 0xff +; SI-NEXT: s_lshl_b32 s17, s50, 8 +; SI-NEXT: s_or_b32 s97, s7, s17 +; SI-NEXT: s_and_b32 s7, s39, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s48, 24 +; SI-NEXT: s_or_b32 s85, s17, s7 +; SI-NEXT: s_and_b32 s7, s55, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s60, 24 +; SI-NEXT: s_or_b32 s84, s17, s7 +; SI-NEXT: s_and_b32 s7, s41, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s42, 24 +; SI-NEXT: s_or_b32 s86, s17, s7 +; SI-NEXT: s_and_b32 s7, s59, 0xff +; SI-NEXT: s_lshl_b32 s17, s9, 8 +; SI-NEXT: s_or_b32 s19, s7, s17 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s17, s87, 24 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: s_mov_b32 s20, s26 +; SI-NEXT: s_mov_b32 s25, s27 +; SI-NEXT: s_mov_b32 s8, s87 +; SI-NEXT: s_or_b32 s87, s17, s7 +; SI-NEXT: s_lshl_b32 s17, s4, 16 +; SI-NEXT: s_lshl_b32 s18, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s6, 16 +; SI-NEXT: s_lshl_b32 s6, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s99, vcc_hi, 16 +; SI-NEXT: s_lshl_b32 s98, s96, 16 +; SI-NEXT: s_lshl_b32 s97, s97, 16 +; SI-NEXT: s_lshl_b32 s96, s19, 16 +; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v43 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v45, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_add_i32_e32 v43, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v54 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: s_add_i32 s57, s57, 3 -; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v9 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s56, 8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v51 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v9 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v27 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v9 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v19 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v13 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s5, s8, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s55, s55, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s6, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s5, s55, 0xff +; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_and_b32 s7, s41, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s42, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s52, s52, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xff -; SI-NEXT: s_lshl_b32 s7, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s6, s52, 0xff +; SI-NEXT: s_lshl_b32 s7, s50, 8 +; SI-NEXT: s_add_i32 s39, s39, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s22, 0xff -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_and_b32 s9, s39, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_lshl_b32 s7, s48, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s9, s7, s6 +; SI-NEXT: s_and_b32 s6, s38, 0xff +; SI-NEXT: s_lshl_b32 s7, s35, 8 +; SI-NEXT: s_add_i32 s30, s30, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s9, s18, 0xff -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_and_b32 s16, s30, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s31, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_add_i32 s16, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s95, 0xff +; SI-NEXT: s_lshl_b32 s7, s93, 8 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s17, s91, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s17 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: s_add_i32 s19, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s90, 0xff +; SI-NEXT: s_lshl_b32 s7, s89, 8 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s17, s79, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s88, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s17 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s78, s78, 3 +; SI-NEXT: s_add_i32 s41, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s78, 0xff +; SI-NEXT: s_lshl_b32 s7, s77, 8 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s17, s75, 0xff +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s76, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s17 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v41, 10 +; SI-NEXT: s_add_i32 s42, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s74, 0xff +; SI-NEXT: s_lshl_b32 s7, s73, 8 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_or_b32 s10, s10, s13 +; SI-NEXT: s_add_i32 s28, s11, 3 +; SI-NEXT: v_readlane_b32 s13, v41, 8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s17, s63, 0xff +; SI-NEXT: s_and_b32 s11, s28, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s72, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: v_readlane_b32 s8, v41, 11 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s17 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_add_i32 s59, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: s_lshl_b32 s7, s61, 8 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: v_readlane_b32 s11, v41, 9 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s17, s57, 0xff +; SI-NEXT: s_add_i32 s24, s11, 3 +; SI-NEXT: v_readlane_b32 s12, v41, 7 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s58, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_add_i32 s26, s20, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s17 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s26, 0xff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s17, s47, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s25, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s7, s17, s7 +; SI-NEXT: s_and_b32 s18, s45, 0xff +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_lshl_b32 s8, s19, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: s_lshl_b32 s17, s46, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v41, 6 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_add_i32 s20, s12, 3 +; SI-NEXT: v_readlane_b32 s13, v41, 4 +; SI-NEXT: v_readlane_b32 s14, v41, 5 +; SI-NEXT: s_or_b32 s7, s17, s7 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_and_b32 s12, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_add_i32 s22, s14, 3 +; SI-NEXT: s_add_i32 s45, s7, 0x3000000 +; SI-NEXT: s_and_b32 s7, s44, 0xff +; SI-NEXT: s_lshl_b32 s17, s43, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s22, 0xff +; SI-NEXT: s_or_b32 s7, s17, s7 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s21, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s17, s40, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s15, s17, s15 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v41, 2 +; SI-NEXT: s_or_b32 s7, s15, s7 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_readlane_b32 s14, v41, 3 +; SI-NEXT: v_readlane_b32 s15, v41, 0 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v41, 1 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s7, 16 -; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s6, 16 -; SI-NEXT: s_and_b32 s15, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s5, 16 -; SI-NEXT: s_and_b32 s42, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s4, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v43 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 -; SI-NEXT: s_branch .LBB111_5 -; SI-NEXT: .LBB111_3: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_branch .LBB111_2 -; SI-NEXT: .LBB111_4: -; SI-NEXT: v_mov_b32_e32 v10, s60 -; SI-NEXT: v_mov_b32_e32 v14, s61 -; SI-NEXT: v_mov_b32_e32 v18, s62 -; SI-NEXT: v_mov_b32_e32 v22, s63 -; SI-NEXT: v_mov_b32_e32 v26, s72 -; SI-NEXT: v_mov_b32_e32 v30, s73 -; SI-NEXT: .LBB111_5: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s13 -; SI-NEXT: v_mov_b32_e32 v4, s14 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s40 -; SI-NEXT: v_mov_b32_e32 v7, s42 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v11, v38 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v16, v39 -; SI-NEXT: v_mov_b32_e32 v19, v33 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v24, v32 -; SI-NEXT: v_mov_b32_e32 v27, v34 -; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_and_b32 s34, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s13, 16 +; SI-NEXT: s_and_b32 s37, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s12, 16 +; SI-NEXT: s_and_b32 s49, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s11, 16 +; SI-NEXT: s_and_b32 s53, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s8, 16 +; SI-NEXT: s_and_b32 s54, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s10, 16 +; SI-NEXT: s_and_b32 s65, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s66, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s45, 16 +; SI-NEXT: s_and_b32 s68, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s69, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s59, 16 +; SI-NEXT: s_and_b32 s71, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s42, 16 +; SI-NEXT: s_and_b32 s80, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s41, 16 +; SI-NEXT: s_and_b32 s82, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s19, 16 +; SI-NEXT: s_and_b32 s83, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s16, 16 +; SI-NEXT: s_and_b32 s85, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s9, 16 +; SI-NEXT: s_and_b32 s86, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s5, 16 +; SI-NEXT: s_and_b32 s87, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s51 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s67 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s71 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s80 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s70 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s98 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s83 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s85 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s87 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s96 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: s_mov_b32 s8, s87 +; SI-NEXT: s_mov_b32 s25, s27 +; SI-NEXT: s_mov_b32 s20, s26 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: s_branch .LBB111_2 ; ; VI-LABEL: bitcast_v64i8_to_v32bf16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index fe226fa0bb47f..36caff3752e26 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -2176,196 +2176,143 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v18i32_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36i16: @@ -2759,40 +2706,40 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v18i32_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s20, v6 -; SI-NEXT: v_readfirstlane_b32 s21, v7 -; SI-NEXT: v_readfirstlane_b32 s18, v8 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: v_readfirstlane_b32 s16, v10 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_readfirstlane_b32 s14, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v13 -; SI-NEXT: v_readfirstlane_b32 s12, v14 -; SI-NEXT: v_readfirstlane_b32 s13, v15 -; SI-NEXT: v_readfirstlane_b32 s10, v16 -; SI-NEXT: v_readfirstlane_b32 s11, v17 -; SI-NEXT: v_readfirstlane_b32 s8, v18 -; SI-NEXT: v_readfirstlane_b32 s9, v19 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_and_b64 s[22:23], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s60, s5, 16 @@ -2855,127 +2802,75 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s23 -; SI-NEXT: v_mov_b32_e32 v1, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s76, 16 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s46, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s75, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s73, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s63, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s24, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s61, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s23, s24, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s22, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr56 @@ -3478,101 +3373,111 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 ; SI-NEXT: v_or_b32_e32 v4, v4, v63 ; SI-NEXT: v_or_b32_e32 v5, v5, v62 ; SI-NEXT: v_or_b32_e32 v6, v6, v61 @@ -3581,8 +3486,10 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v58 ; SI-NEXT: v_or_b32_e32 v10, v10, v57 ; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_or_b32_e32 v16, v16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -3595,15 +3502,11 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -3612,47 +3515,55 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v46 -; SI-NEXT: v_or_b32_e32 v14, v14, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -3665,13 +3576,13 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_or_b32_e32 v5, v62, v5 ; SI-NEXT: v_or_b32_e32 v6, v61, v6 @@ -3680,8 +3591,8 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v58, v9 ; SI-NEXT: v_or_b32_e32 v10, v57, v10 ; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -3690,24 +3601,22 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v46, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -3718,33 +3627,33 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v36i16_to_v18i32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v17 -; VI-NEXT: v_mov_b32_e32 v33, v16 -; VI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v36i16_to_v18i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v17 +; VI-NEXT: v_mov_b32_e32 v33, v16 +; VI-NEXT: v_mov_b32_e32 v34, v15 ; VI-NEXT: v_mov_b32_e32 v35, v14 ; VI-NEXT: v_mov_b32_e32 v36, v13 ; VI-NEXT: v_mov_b32_e32 v37, v12 @@ -4205,185 +4114,203 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v36i16_to_v18i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v20 -; SI-NEXT: v_mov_b32_e32 v33, v18 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v35, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v39, v6 -; SI-NEXT: v_mov_b32_e32 v48, v4 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v7, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v8, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v38 +; SI-NEXT: v_or_b32_e32 v17, v0, v36 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -4794,23 +4721,21 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v18i32_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -4820,86 +4745,87 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -4917,11 +4843,11 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -4939,196 +4865,143 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36f16: @@ -5522,96 +5395,96 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-LABEL: bitcast_v18i32_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s23, v6 -; SI-NEXT: v_readfirstlane_b32 s22, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v8 -; SI-NEXT: v_readfirstlane_b32 s20, v9 -; SI-NEXT: v_readfirstlane_b32 s19, v10 -; SI-NEXT: v_readfirstlane_b32 s18, v11 -; SI-NEXT: v_readfirstlane_b32 s17, v12 -; SI-NEXT: v_readfirstlane_b32 s16, v13 -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_readfirstlane_b32 s14, v15 -; SI-NEXT: v_readfirstlane_b32 s13, v16 -; SI-NEXT: v_readfirstlane_b32 s12, v17 -; SI-NEXT: v_readfirstlane_b32 s11, v18 -; SI-NEXT: v_readfirstlane_b32 s10, v19 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_readfirstlane_b32 s23, v5 +; SI-NEXT: v_readfirstlane_b32 s22, v6 +; SI-NEXT: v_readfirstlane_b32 s21, v7 +; SI-NEXT: v_readfirstlane_b32 s20, v8 +; SI-NEXT: v_readfirstlane_b32 s19, v9 +; SI-NEXT: v_readfirstlane_b32 s18, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s13, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v16 +; SI-NEXT: v_readfirstlane_b32 s11, v17 +; SI-NEXT: v_readfirstlane_b32 s10, v18 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s23, s23, 3 @@ -5650,206 +5523,153 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s47, s7, 16 ; SI-NEXT: s_lshr_b32 s56, s6, 16 ; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_or_b32_e32 v13, v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v14, v21, v14 +; SI-NEXT: v_or_b32_e32 v16, v19, v16 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: @@ -6332,94 +6152,128 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 @@ -6434,8 +6288,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v62, v2 @@ -6449,6 +6304,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v10, v54, v10 ; SI-NEXT: v_or_b32_e32 v11, v52, v11 ; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v38, v14 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -6476,30 +6334,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v36, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 @@ -6595,90 +6451,86 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7151,6 +7003,16 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v18i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7167,89 +7029,132 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v63, v3 -; SI-NEXT: v_or_b32_e32 v4, v41, v4 -; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 ; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v39, v10 -; SI-NEXT: v_or_b32_e32 v11, v37, v11 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v13, v29, v13 -; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_or_b32_e32 v15, v25, v15 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v29, v11 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_or_b32_e32 v14, v24, v14 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -7264,11 +7169,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7277,20 +7181,18 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -7298,11 +7200,11 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -7313,10 +7215,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7324,29 +7226,29 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -7354,10 +7256,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -7365,11 +7267,11 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -7377,10 +7279,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -7415,59 +7317,63 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v42 +; SI-NEXT: v_mov_b32_e32 v42, v36 ; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v43, v37 ; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v23 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v39 -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 -; SI-NEXT: v_mov_b32_e32 v44, v49 -; SI-NEXT: v_mov_b32_e32 v49, v26 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v53, v30 -; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v44, v38 +; SI-NEXT: v_mov_b32_e32 v38, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v45 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v25 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v53, v27 +; SI-NEXT: v_mov_b32_e32 v54, v28 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v24, v39 -; SI-NEXT: v_mov_b32_e32 v39, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v38, v44 +; SI-NEXT: v_mov_b32_e32 v44, v33 ; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v43 +; SI-NEXT: v_mov_b32_e32 v43, v34 ; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v54 -; SI-NEXT: v_mov_b32_e32 v30, v53 -; SI-NEXT: v_mov_b32_e32 v29, v52 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 -; SI-NEXT: v_mov_b32_e32 v25, v48 -; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: v_mov_b32_e32 v42, v35 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v41 +; SI-NEXT: v_mov_b32_e32 v30, v40 +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v27, v53 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v25, v51 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v23, v49 +; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v36f16_to_v18i32_scalar: @@ -9344,196 +9250,143 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v18f32_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36i16: @@ -9909,208 +9762,159 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-LABEL: bitcast_v18f32_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_mov_b32_e32 v18, s16 ; SI-NEXT: v_mov_b32_e32 v19, s17 ; SI-NEXT: v_mov_b32_e32 v16, s18 ; SI-NEXT: v_mov_b32_e32 v17, s19 ; SI-NEXT: v_mov_b32_e32 v14, s20 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v18f32_to_v36i16_scalar: @@ -10693,101 +10497,111 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 ; SI-NEXT: v_or_b32_e32 v4, v4, v63 ; SI-NEXT: v_or_b32_e32 v5, v5, v62 ; SI-NEXT: v_or_b32_e32 v6, v6, v61 @@ -10796,8 +10610,10 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v58 ; SI-NEXT: v_or_b32_e32 v10, v10, v57 ; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_or_b32_e32 v16, v16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -10810,15 +10626,11 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -10827,47 +10639,55 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v46 -; SI-NEXT: v_or_b32_e32 v14, v14, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -10880,13 +10700,13 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_or_b32_e32 v5, v62, v5 ; SI-NEXT: v_or_b32_e32 v6, v61, v6 @@ -10895,8 +10715,8 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v58, v9 ; SI-NEXT: v_or_b32_e32 v10, v57, v10 ; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -10905,24 +10725,22 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v46, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -10933,22 +10751,22 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11420,185 +11238,203 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; SI-LABEL: bitcast_v36i16_to_v18f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v20 -; SI-NEXT: v_mov_b32_e32 v33, v18 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v35, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v39, v6 -; SI-NEXT: v_mov_b32_e32 v48, v4 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v7, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v8, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v38 +; SI-NEXT: v_or_b32_e32 v17, v0, v36 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -12009,23 +11845,21 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v18f32_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -12035,86 +11869,87 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -12132,11 +11967,11 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -12154,196 +11989,143 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36f16: @@ -12719,331 +12501,277 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI-LABEL: bitcast_v18f32_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, s16 -; SI-NEXT: v_mov_b32_e32 v45, s17 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v43, s19 -; SI-NEXT: v_mov_b32_e32 v42, s20 -; SI-NEXT: v_mov_b32_e32 v41, s21 -; SI-NEXT: v_mov_b32_e32 v40, s22 -; SI-NEXT: v_mov_b32_e32 v55, s23 -; SI-NEXT: v_mov_b32_e32 v54, s24 -; SI-NEXT: v_mov_b32_e32 v53, s25 -; SI-NEXT: v_mov_b32_e32 v51, s26 -; SI-NEXT: v_mov_b32_e32 v50, s27 -; SI-NEXT: v_mov_b32_e32 v49, s28 +; SI-NEXT: v_mov_b32_e32 v45, s16 +; SI-NEXT: v_mov_b32_e32 v44, s17 +; SI-NEXT: v_mov_b32_e32 v43, s18 +; SI-NEXT: v_mov_b32_e32 v42, s19 +; SI-NEXT: v_mov_b32_e32 v41, s20 +; SI-NEXT: v_mov_b32_e32 v40, s21 +; SI-NEXT: v_mov_b32_e32 v55, s22 +; SI-NEXT: v_mov_b32_e32 v54, s23 +; SI-NEXT: v_mov_b32_e32 v53, s24 +; SI-NEXT: v_mov_b32_e32 v52, s25 +; SI-NEXT: v_mov_b32_e32 v50, s26 +; SI-NEXT: v_mov_b32_e32 v49, s27 +; SI-NEXT: v_mov_b32_e32 v48, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v52, s29 +; SI-NEXT: v_mov_b32_e32 v51, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v5, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v41 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v40 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_or_b32_e32 v13, v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v14, v21, v14 +; SI-NEXT: v_or_b32_e32 v16, v19, v16 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: @@ -13626,94 +13354,128 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 @@ -13728,8 +13490,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v62, v2 @@ -13743,6 +13506,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v10, v54, v10 ; SI-NEXT: v_or_b32_e32 v11, v52, v11 ; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v38, v14 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -13770,30 +13536,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v36, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 @@ -13889,90 +13653,86 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14445,6 +14205,16 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v18f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -14461,89 +14231,132 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v63, v3 -; SI-NEXT: v_or_b32_e32 v4, v41, v4 -; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 ; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v39, v10 -; SI-NEXT: v_or_b32_e32 v11, v37, v11 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v13, v29, v13 -; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_or_b32_e32 v15, v25, v15 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v29, v11 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_or_b32_e32 v14, v24, v14 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -14558,11 +14371,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -14571,20 +14383,18 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -14592,11 +14402,11 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -14607,10 +14417,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -14618,29 +14428,29 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -14648,10 +14458,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -14659,11 +14469,11 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -14671,10 +14481,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -14709,59 +14519,63 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v42 +; SI-NEXT: v_mov_b32_e32 v42, v36 ; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v43, v37 ; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v23 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v39 -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 -; SI-NEXT: v_mov_b32_e32 v44, v49 -; SI-NEXT: v_mov_b32_e32 v49, v26 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v53, v30 -; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v44, v38 +; SI-NEXT: v_mov_b32_e32 v38, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v45 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v25 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v53, v27 +; SI-NEXT: v_mov_b32_e32 v54, v28 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v24, v39 -; SI-NEXT: v_mov_b32_e32 v39, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v38, v44 +; SI-NEXT: v_mov_b32_e32 v44, v33 ; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v43 +; SI-NEXT: v_mov_b32_e32 v43, v34 ; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v54 -; SI-NEXT: v_mov_b32_e32 v30, v53 -; SI-NEXT: v_mov_b32_e32 v29, v52 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 -; SI-NEXT: v_mov_b32_e32 v25, v48 -; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: v_mov_b32_e32 v42, v35 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v41 +; SI-NEXT: v_mov_b32_e32 v30, v40 +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v27, v53 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v25, v51 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v23, v49 +; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v36f16_to_v18f32_scalar: @@ -15837,196 +15651,143 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v9i64_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36i16: @@ -16430,40 +16191,40 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v9i64_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s20, v6 -; SI-NEXT: v_readfirstlane_b32 s21, v7 -; SI-NEXT: v_readfirstlane_b32 s18, v8 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: v_readfirstlane_b32 s16, v10 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_readfirstlane_b32 s14, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v13 -; SI-NEXT: v_readfirstlane_b32 s12, v14 -; SI-NEXT: v_readfirstlane_b32 s13, v15 -; SI-NEXT: v_readfirstlane_b32 s10, v16 -; SI-NEXT: v_readfirstlane_b32 s11, v17 -; SI-NEXT: v_readfirstlane_b32 s8, v18 -; SI-NEXT: v_readfirstlane_b32 s9, v19 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_and_b64 s[22:23], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s60, s5, 16 @@ -16526,127 +16287,75 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s23 -; SI-NEXT: v_mov_b32_e32 v1, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s76, 16 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s46, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s75, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s73, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s63, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s24, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s61, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s23, s24, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s22, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr56 @@ -17149,101 +16858,111 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 ; SI-NEXT: v_or_b32_e32 v4, v4, v63 ; SI-NEXT: v_or_b32_e32 v5, v5, v62 ; SI-NEXT: v_or_b32_e32 v6, v6, v61 @@ -17252,8 +16971,10 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v58 ; SI-NEXT: v_or_b32_e32 v10, v10, v57 ; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_or_b32_e32 v16, v16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -17266,15 +16987,11 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -17283,47 +17000,55 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v46 -; SI-NEXT: v_or_b32_e32 v14, v14, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -17336,13 +17061,13 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_or_b32_e32 v5, v62, v5 ; SI-NEXT: v_or_b32_e32 v6, v61, v6 @@ -17351,8 +17076,8 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v58, v9 ; SI-NEXT: v_or_b32_e32 v10, v57, v10 ; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -17361,24 +17086,22 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v46, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -17389,22 +17112,22 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17876,185 +17599,203 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v36i16_to_v9i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v20 -; SI-NEXT: v_mov_b32_e32 v33, v18 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v35, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v39, v6 -; SI-NEXT: v_mov_b32_e32 v48, v4 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v7, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v8, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v38 +; SI-NEXT: v_or_b32_e32 v17, v0, v36 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -18465,23 +18206,21 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v9i64_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -18491,86 +18230,87 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -18588,237 +18328,184 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v9i64_to_v36f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9i64_to_v36f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 @@ -19203,96 +18890,96 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v9i64_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, s16 -; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s23, v7 -; SI-NEXT: v_readfirstlane_b32 s20, v8 -; SI-NEXT: v_readfirstlane_b32 s21, v9 -; SI-NEXT: v_readfirstlane_b32 s18, v10 -; SI-NEXT: v_readfirstlane_b32 s19, v11 -; SI-NEXT: v_readfirstlane_b32 s16, v12 -; SI-NEXT: v_readfirstlane_b32 s17, v13 -; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_readfirstlane_b32 s15, v15 -; SI-NEXT: v_readfirstlane_b32 s12, v16 -; SI-NEXT: v_readfirstlane_b32 s13, v17 -; SI-NEXT: v_readfirstlane_b32 s10, v18 -; SI-NEXT: v_readfirstlane_b32 s11, v19 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_readfirstlane_b32 s22, v5 +; SI-NEXT: v_readfirstlane_b32 s23, v6 +; SI-NEXT: v_readfirstlane_b32 s20, v7 +; SI-NEXT: v_readfirstlane_b32 s21, v8 +; SI-NEXT: v_readfirstlane_b32 s18, v9 +; SI-NEXT: v_readfirstlane_b32 s19, v10 +; SI-NEXT: v_readfirstlane_b32 s16, v11 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v13 +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_readfirstlane_b32 s12, v15 +; SI-NEXT: v_readfirstlane_b32 s13, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v17 +; SI-NEXT: v_readfirstlane_b32 s11, v18 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s22, 3 @@ -19331,206 +19018,153 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s56, s6, 16 ; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_or_b32_e32 v13, v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v14, v21, v14 +; SI-NEXT: v_or_b32_e32 v16, v19, v16 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: @@ -20013,94 +19647,128 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB46_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 @@ -20115,8 +19783,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v62, v2 @@ -20130,6 +19799,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v10, v54, v10 ; SI-NEXT: v_or_b32_e32 v11, v52, v11 ; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v38, v14 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -20157,30 +19829,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v36, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 @@ -20276,90 +19946,86 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20832,6 +20498,16 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-LABEL: bitcast_v36f16_to_v9i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -20848,89 +20524,132 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v63, v3 -; SI-NEXT: v_or_b32_e32 v4, v41, v4 -; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 ; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v39, v10 -; SI-NEXT: v_or_b32_e32 v11, v37, v11 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v13, v29, v13 -; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_or_b32_e32 v15, v25, v15 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v29, v11 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_or_b32_e32 v14, v24, v14 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -20945,11 +20664,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20958,20 +20676,18 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -20979,11 +20695,11 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -20994,10 +20710,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -21005,29 +20721,29 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -21035,10 +20751,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -21046,11 +20762,11 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -21058,10 +20774,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -21096,59 +20812,63 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v42 +; SI-NEXT: v_mov_b32_e32 v42, v36 ; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v43, v37 ; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v23 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v39 -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 -; SI-NEXT: v_mov_b32_e32 v44, v49 -; SI-NEXT: v_mov_b32_e32 v49, v26 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v53, v30 -; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v44, v38 +; SI-NEXT: v_mov_b32_e32 v38, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v45 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v25 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v53, v27 +; SI-NEXT: v_mov_b32_e32 v54, v28 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v24, v39 -; SI-NEXT: v_mov_b32_e32 v39, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v38, v44 +; SI-NEXT: v_mov_b32_e32 v44, v33 ; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v43 +; SI-NEXT: v_mov_b32_e32 v43, v34 ; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v54 -; SI-NEXT: v_mov_b32_e32 v30, v53 -; SI-NEXT: v_mov_b32_e32 v29, v52 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 -; SI-NEXT: v_mov_b32_e32 v25, v48 -; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: v_mov_b32_e32 v42, v35 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v41 +; SI-NEXT: v_mov_b32_e32 v30, v40 +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v27, v53 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v25, v51 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v23, v49 +; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v36f16_to_v9i64_scalar: @@ -21527,187 +21247,134 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v9f64_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36i16: @@ -22065,199 +21732,150 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-LABEL: bitcast_v9f64_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_mov_b32_e32 v18, s16 ; SI-NEXT: v_mov_b32_e32 v19, s17 ; SI-NEXT: v_mov_b32_e32 v16, s18 ; SI-NEXT: v_mov_b32_e32 v17, s19 ; SI-NEXT: v_mov_b32_e32 v14, s20 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v9f64_to_v36i16_scalar: @@ -22807,101 +22425,111 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 ; SI-NEXT: v_or_b32_e32 v4, v4, v63 ; SI-NEXT: v_or_b32_e32 v5, v5, v62 ; SI-NEXT: v_or_b32_e32 v6, v6, v61 @@ -22910,8 +22538,10 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v58 ; SI-NEXT: v_or_b32_e32 v10, v10, v57 ; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_or_b32_e32 v16, v16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -22924,15 +22554,11 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -22941,47 +22567,55 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v46 -; SI-NEXT: v_or_b32_e32 v14, v14, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -22994,13 +22628,13 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_or_b32_e32 v5, v62, v5 ; SI-NEXT: v_or_b32_e32 v6, v61, v6 @@ -23009,8 +22643,8 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v58, v9 ; SI-NEXT: v_or_b32_e32 v10, v57, v10 ; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -23019,24 +22653,22 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v46, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -23047,22 +22679,22 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23534,185 +23166,203 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; SI-LABEL: bitcast_v36i16_to_v9f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v20 -; SI-NEXT: v_mov_b32_e32 v33, v18 -; SI-NEXT: v_mov_b32_e32 v34, v16 -; SI-NEXT: v_mov_b32_e32 v35, v14 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v38, v8 -; SI-NEXT: v_mov_b32_e32 v39, v6 -; SI-NEXT: v_mov_b32_e32 v48, v4 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v7, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v8, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v38 +; SI-NEXT: v_or_b32_e32 v17, v0, v36 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -24123,23 +23773,21 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v9f64_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -24149,297 +23797,245 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36f16: @@ -24797,22 +24393,22 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-LABEL: bitcast_v9f64_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s27 +; SI-NEXT: v_mov_b32_e32 v5, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_mov_b32_e32 v6, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -24822,255 +24418,142 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -25078,45 +24561,107 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: @@ -25666,94 +25211,128 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 @@ -25768,8 +25347,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v2, v62, v2 @@ -25783,6 +25363,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v10, v54, v10 ; SI-NEXT: v_or_b32_e32 v11, v52, v11 ; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v38, v14 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -25810,30 +25393,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v36, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 @@ -25929,90 +25510,86 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26485,6 +26062,16 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v9f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -26501,89 +26088,132 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 -; SI-NEXT: v_or_b32_e32 v3, v63, v3 -; SI-NEXT: v_or_b32_e32 v4, v41, v4 -; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 ; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v39, v10 -; SI-NEXT: v_or_b32_e32 v11, v37, v11 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v13, v29, v13 -; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_or_b32_e32 v15, v25, v15 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v29, v11 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_or_b32_e32 v14, v24, v14 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -26598,11 +26228,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26611,20 +26240,18 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -26632,11 +26259,11 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -26647,10 +26274,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -26658,29 +26285,29 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -26688,10 +26315,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -26699,11 +26326,11 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -26711,10 +26338,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -26749,59 +26376,63 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v42 +; SI-NEXT: v_mov_b32_e32 v42, v36 ; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v43, v37 ; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v23 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v39 -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 -; SI-NEXT: v_mov_b32_e32 v44, v49 -; SI-NEXT: v_mov_b32_e32 v49, v26 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v53, v30 -; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v44, v38 +; SI-NEXT: v_mov_b32_e32 v38, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v45 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v25 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v53, v27 +; SI-NEXT: v_mov_b32_e32 v54, v28 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v24, v39 -; SI-NEXT: v_mov_b32_e32 v39, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v38, v44 +; SI-NEXT: v_mov_b32_e32 v44, v33 ; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v43 +; SI-NEXT: v_mov_b32_e32 v43, v34 ; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v54 -; SI-NEXT: v_mov_b32_e32 v30, v53 -; SI-NEXT: v_mov_b32_e32 v29, v52 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 -; SI-NEXT: v_mov_b32_e32 v25, v48 -; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: v_mov_b32_e32 v42, v35 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v41 +; SI-NEXT: v_mov_b32_e32 v30, v40 +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v27, v53 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v25, v51 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v23, v49 +; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v36f16_to_v9f64_scalar: @@ -27179,135 +26810,144 @@ end: define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -27324,7 +26964,16 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -27333,274 +26982,206 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v63 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v61 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v36f16: @@ -27967,312 +27548,249 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-LABEL: bitcast_v36i16_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v50 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v36i16_to_v36f16_scalar: @@ -28867,337 +28385,311 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 -; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_or_b32_e32 v31, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v33, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v36, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v39, v20, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v50, v21, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v49, v21, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_or_b32_e32 v38, v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v35, v25, v23 -; SI-NEXT: v_or_b32_e32 v17, v17, v24 -; SI-NEXT: v_or_b32_e32 v15, v15, v19 -; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_alignbit_b32 v55, v39, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v36, v22, 16 -; SI-NEXT: v_alignbit_b32 v53, v33, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v31, v23, 16 -; SI-NEXT: v_alignbit_b32 v51, v13, v24, 16 -; SI-NEXT: v_alignbit_b32 v19, v10, v19, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 -; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_or_b32_e32 v20, v20, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_alignbit_b32 v35, v1, v35, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v34, 16 +; SI-NEXT: v_alignbit_b32 v33, v5, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v31, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v30, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v17, v25, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v36i16: @@ -29565,322 +29057,325 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v45, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s26, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v21, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_or_b32_e32 v8, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_or_b32_e32 v43, v12, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v45, v11, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v42, v12, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v54, v19, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v44, v18, v0 +; SI-NEXT: v_or_b32_e32 v42, v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v43, v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v45 +; SI-NEXT: v_or_b32_e32 v55, v18, v6 +; SI-NEXT: v_or_b32_e32 v52, v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v40, v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v53, v12, v7 -; SI-NEXT: v_or_b32_e32 v51, v11, v5 -; SI-NEXT: v_or_b32_e32 v48, v19, v3 -; SI-NEXT: v_or_b32_e32 v38, v22, v1 -; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v45, v19, v10 +; SI-NEXT: v_or_b32_e32 v40, v18, v12 +; SI-NEXT: v_or_b32_e32 v53, v20, v14 +; SI-NEXT: v_or_b32_e32 v50, v21, v16 +; SI-NEXT: v_lshr_b64 v[34:35], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[16:17], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v40 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 51bffb7f7c8cd..ce06af35bf4f0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -2200,216 +2200,157 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v20i32_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40i16: @@ -2837,42 +2778,42 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v20i32_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v8 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s20, v10 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s12, v18 -; SI-NEXT: v_readfirstlane_b32 s13, v19 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s19 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v7 +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_and_b64 s[24:25], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s72, s5, 16 @@ -2941,141 +2882,83 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_or_b32 s22, s22, s25 -; SI-NEXT: v_mov_b32_e32 v1, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s89, 16 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_lshl_b32 s22, s58, 16 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s79, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s77, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s75, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s73, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s25, s26, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s24, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr60 @@ -3652,132 +3535,136 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 ; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v40 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_or_b32_e32 v6, v6, v33 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_or_b32_e32 v9, v9, v62 ; SI-NEXT: v_or_b32_e32 v10, v10, v61 -; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -3787,81 +3674,91 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v59 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 -; SI-NEXT: v_or_b32_e32 v14, v14, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v56 -; SI-NEXT: v_or_b32_e32 v17, v17, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -3873,20 +3770,20 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v46, v5 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 ; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 ; SI-NEXT: v_or_b32_e32 v10, v61, v10 -; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -3897,62 +3794,64 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v11, v60, v11 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_or_b32_e32 v14, v57, v14 -; SI-NEXT: v_or_b32_e32 v15, v56, v15 -; SI-NEXT: v_or_b32_e32 v17, v46, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4484,209 +4383,221 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v40i16_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v24 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v51, v2 -; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v7, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v8, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v9, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v10, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -5228,29 +5139,27 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v20i32_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -5260,96 +5169,98 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -5369,11 +5280,11 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -5393,222 +5304,163 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40f16: @@ -6036,109 +5888,109 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-LABEL: bitcast_v20i32_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s23, v8 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_readfirstlane_b32 s24, v9 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s25, v10 -; SI-NEXT: v_readfirstlane_b32 s22, v11 -; SI-NEXT: v_readfirstlane_b32 s21, v12 -; SI-NEXT: v_readfirstlane_b32 s20, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s18, v15 -; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_readfirstlane_b32 s16, v17 -; SI-NEXT: v_readfirstlane_b32 s15, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v19 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s19 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v9 +; SI-NEXT: v_readfirstlane_b32 s22, v10 +; SI-NEXT: v_readfirstlane_b32 s21, v11 +; SI-NEXT: v_readfirstlane_b32 s20, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s14, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v19 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v5 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -6156,9 +6008,9 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s5, s25, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 ; SI-NEXT: s_lshr_b32 s27, s22, 16 ; SI-NEXT: s_lshr_b32 s28, s21, 16 ; SI-NEXT: s_lshr_b32 s29, s20, 16 @@ -6176,228 +6028,169 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s59, s7, 16 ; SI-NEXT: s_lshr_b32 s60, s6, 16 ; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v11, v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v17, v22, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v12, v27, v12 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: @@ -6952,128 +6745,146 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 @@ -7086,9 +6897,11 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 @@ -7100,7 +6913,11 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v8, v46, v8 ; SI-NEXT: v_or_b32_e32 v9, v44, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -7124,47 +6941,50 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 @@ -7172,9 +6992,6 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -7192,7 +7009,11 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7202,175 +7023,165 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7903,6 +7714,16 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7919,98 +7740,149 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 -; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v55, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v29, v16 -; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_or_b32_e32 v16, v26, v16 +; SI-NEXT: v_or_b32_e32 v17, v24, v17 ; SI-NEXT: v_or_b32_e32 v18, v22, v18 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8023,10 +7895,11 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8035,18 +7908,19 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -8054,10 +7928,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -8065,25 +7939,24 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -8092,20 +7965,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -8114,20 +7987,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -8136,7 +8009,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -8147,7 +8020,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -8188,87 +8061,86 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v44 +; SI-NEXT: v_mov_b32_e32 v44, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v38, v45 +; SI-NEXT: v_mov_b32_e32 v45, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v46, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v36, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v45, v52 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: v_mov_b32_e32 v46, v53 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v47, v54 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v55 -; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: v_mov_b32_e32 v56, v52 +; SI-NEXT: v_mov_b32_e32 v52, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v57 +; SI-NEXT: v_mov_b32_e32 v57, v53 +; SI-NEXT: v_mov_b32_e32 v53, v25 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v32, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v55, v27 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: v_mov_b32_e32 v41, v29 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v25, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v32 -; SI-NEXT: v_mov_b32_e32 v24, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v55 +; SI-NEXT: v_mov_b32_e32 v55, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v33 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_mov_b32_e32 v57, v34 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_mov_b32_e32 v56, v35 ; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v36 ; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v45 +; SI-NEXT: v_mov_b32_e32 v45, v38 ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_mov_b32_e32 v53, v46 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v48, v44 +; SI-NEXT: v_mov_b32_e32 v44, v39 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v29, v41 +; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v40f16_to_v20i32_scalar: @@ -10186,216 +10058,157 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v20f32_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40i16: @@ -10803,228 +10616,175 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-LABEL: bitcast_v20f32_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v17, s18 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v20f32_to_v40i16_scalar: @@ -11621,132 +11381,136 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 ; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v40 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_or_b32_e32 v6, v6, v33 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_or_b32_e32 v9, v9, v62 ; SI-NEXT: v_or_b32_e32 v10, v10, v61 -; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -11756,81 +11520,91 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v59 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 -; SI-NEXT: v_or_b32_e32 v14, v14, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v56 -; SI-NEXT: v_or_b32_e32 v17, v17, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -11842,20 +11616,20 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v46, v5 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 ; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 ; SI-NEXT: v_or_b32_e32 v10, v61, v10 -; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -11866,62 +11640,64 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v11, v60, v11 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_or_b32_e32 v14, v57, v14 -; SI-NEXT: v_or_b32_e32 v15, v56, v15 -; SI-NEXT: v_or_b32_e32 v17, v46, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12453,209 +12229,221 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; SI-LABEL: bitcast_v40i16_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v24 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v51, v2 -; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v7, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v8, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v9, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v10, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -13197,29 +12985,27 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v20f32_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -13229,96 +13015,98 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -13338,11 +13126,11 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -13362,222 +13150,163 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40f16: @@ -13985,375 +13714,315 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-LABEL: bitcast_v20f32_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, s16 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s18 -; SI-NEXT: v_mov_b32_e32 v57, s19 -; SI-NEXT: v_mov_b32_e32 v56, s20 -; SI-NEXT: v_mov_b32_e32 v47, s21 -; SI-NEXT: v_mov_b32_e32 v46, s22 -; SI-NEXT: v_mov_b32_e32 v45, s23 -; SI-NEXT: v_mov_b32_e32 v44, s24 -; SI-NEXT: v_mov_b32_e32 v41, s25 +; SI-NEXT: v_mov_b32_e32 v59, s16 +; SI-NEXT: v_mov_b32_e32 v58, s17 +; SI-NEXT: v_mov_b32_e32 v57, s18 +; SI-NEXT: v_mov_b32_e32 v56, s19 +; SI-NEXT: v_mov_b32_e32 v47, s20 +; SI-NEXT: v_mov_b32_e32 v46, s21 +; SI-NEXT: v_mov_b32_e32 v45, s22 +; SI-NEXT: v_mov_b32_e32 v44, s23 +; SI-NEXT: v_mov_b32_e32 v43, s24 +; SI-NEXT: v_mov_b32_e32 v42, s25 ; SI-NEXT: v_mov_b32_e32 v40, s26 ; SI-NEXT: v_mov_b32_e32 v55, s27 -; SI-NEXT: v_mov_b32_e32 v42, s28 +; SI-NEXT: v_mov_b32_e32 v54, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v43, s29 +; SI-NEXT: v_mov_b32_e32 v41, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v47 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v7, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v47 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v57 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v46 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v40 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v41 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v11, v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v17, v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v12, v27, v12 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: @@ -14950,128 +14619,146 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB34_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 @@ -15084,9 +14771,11 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 @@ -15098,7 +14787,11 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v8, v46, v8 ; SI-NEXT: v_or_b32_e32 v9, v44, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -15122,47 +14815,50 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 @@ -15170,9 +14866,6 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -15190,7 +14883,11 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -15200,152 +14897,142 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -15353,22 +15040,22 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15901,6 +15588,16 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-LABEL: bitcast_v40f16_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -15917,98 +15614,149 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 -; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v55, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v29, v16 -; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_or_b32_e32 v16, v26, v16 +; SI-NEXT: v_or_b32_e32 v17, v24, v17 ; SI-NEXT: v_or_b32_e32 v18, v22, v18 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16021,10 +15769,11 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -16033,18 +15782,19 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -16052,10 +15802,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -16063,25 +15813,24 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -16090,20 +15839,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -16112,20 +15861,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -16134,7 +15883,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -16145,7 +15894,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -16186,87 +15935,86 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v44 +; SI-NEXT: v_mov_b32_e32 v44, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v38, v45 +; SI-NEXT: v_mov_b32_e32 v45, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v46, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v36, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v45, v52 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: v_mov_b32_e32 v46, v53 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v47, v54 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v55 -; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v32, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: v_mov_b32_e32 v56, v52 +; SI-NEXT: v_mov_b32_e32 v52, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v57 +; SI-NEXT: v_mov_b32_e32 v57, v53 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v55, v27 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: v_mov_b32_e32 v41, v29 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v25, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v32 -; SI-NEXT: v_mov_b32_e32 v24, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v55 +; SI-NEXT: v_mov_b32_e32 v55, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v33 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_mov_b32_e32 v57, v34 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_mov_b32_e32 v56, v35 ; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v36 ; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v45 +; SI-NEXT: v_mov_b32_e32 v45, v38 ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_mov_b32_e32 v53, v46 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v48, v44 +; SI-NEXT: v_mov_b32_e32 v44, v39 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v29, v41 +; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v40f16_to_v20f32_scalar: @@ -17464,216 +17212,157 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v10i64_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40i16: @@ -18111,42 +17800,42 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v10i64_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v8 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s20, v10 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s12, v18 -; SI-NEXT: v_readfirstlane_b32 s13, v19 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s19 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v7 +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_and_b64 s[24:25], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s72, s5, 16 @@ -18215,141 +17904,83 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_or_b32 s22, s22, s25 -; SI-NEXT: v_mov_b32_e32 v1, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s89, 16 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_lshl_b32 s22, s58, 16 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s79, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s77, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s75, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s73, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s25, s26, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s24, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr60 @@ -18926,132 +18557,136 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 ; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v40 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_or_b32_e32 v6, v6, v33 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_or_b32_e32 v9, v9, v62 ; SI-NEXT: v_or_b32_e32 v10, v10, v61 -; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -19061,81 +18696,91 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v59 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 -; SI-NEXT: v_or_b32_e32 v14, v14, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v56 -; SI-NEXT: v_or_b32_e32 v17, v17, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -19147,20 +18792,20 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v46, v5 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 ; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 ; SI-NEXT: v_or_b32_e32 v10, v61, v10 -; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -19171,62 +18816,64 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v11, v60, v11 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_or_b32_e32 v14, v57, v14 -; SI-NEXT: v_or_b32_e32 v15, v56, v15 -; SI-NEXT: v_or_b32_e32 v17, v46, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19758,209 +19405,221 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v40i16_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v24 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v51, v2 -; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v7, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v8, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v9, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v10, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -20502,29 +20161,27 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v10i64_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -20534,96 +20191,98 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -20643,246 +20302,187 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40f16: @@ -21320,114 +20920,114 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-LABEL: bitcast_v10i64_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v8 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_readfirstlane_b32 s25, v9 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s19 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s22, v9 ; SI-NEXT: v_readfirstlane_b32 s23, v10 -; SI-NEXT: v_readfirstlane_b32 s24, v11 -; SI-NEXT: v_readfirstlane_b32 s20, v12 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_readfirstlane_b32 s19, v15 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_readfirstlane_b32 s17, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v18 -; SI-NEXT: v_readfirstlane_b32 s15, v19 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s20, v11 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_readfirstlane_b32 s18, v13 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s16, v15 +; SI-NEXT: v_readfirstlane_b32 s17, v16 +; SI-NEXT: v_readfirstlane_b32 s14, v17 +; SI-NEXT: v_readfirstlane_b32 s15, v18 +; SI-NEXT: v_readfirstlane_b32 s12, v19 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v5 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_add_u32 s4, s24, 3 ; SI-NEXT: s_addc_u32 s5, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s4, 16 +; SI-NEXT: s_lshr_b32 s24, s4, 16 ; SI-NEXT: s_lshr_b32 s25, s5, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s24, s24, 0 -; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: s_lshr_b32 s27, s24, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s26, s22, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s28, s20, 16 @@ -21460,228 +21060,169 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s60, s6, 16 ; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s24 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v35, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v11, v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v17, v22, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v12, v27, v12 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: @@ -22236,128 +21777,146 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 @@ -22370,9 +21929,11 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 @@ -22384,7 +21945,11 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v8, v46, v8 ; SI-NEXT: v_or_b32_e32 v9, v44, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -22408,47 +21973,50 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 @@ -22456,9 +22024,6 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -22476,7 +22041,11 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22486,152 +22055,142 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -22639,22 +22198,22 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23187,6 +22746,16 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23203,98 +22772,149 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 -; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v55, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v29, v16 -; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_or_b32_e32 v16, v26, v16 +; SI-NEXT: v_or_b32_e32 v17, v24, v17 ; SI-NEXT: v_or_b32_e32 v18, v22, v18 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23307,10 +22927,11 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -23319,18 +22940,19 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -23338,10 +22960,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -23349,25 +22971,24 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -23376,20 +22997,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -23398,20 +23019,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -23420,7 +23041,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -23431,7 +23052,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -23472,87 +23093,86 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v44 +; SI-NEXT: v_mov_b32_e32 v44, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v38, v45 +; SI-NEXT: v_mov_b32_e32 v45, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v46, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v36, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v45, v52 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: v_mov_b32_e32 v46, v53 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v47, v54 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v55 -; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: v_mov_b32_e32 v56, v52 +; SI-NEXT: v_mov_b32_e32 v52, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v57 +; SI-NEXT: v_mov_b32_e32 v57, v53 +; SI-NEXT: v_mov_b32_e32 v53, v25 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v32, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v55, v27 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: v_mov_b32_e32 v41, v29 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v25, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v32 -; SI-NEXT: v_mov_b32_e32 v24, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v55 +; SI-NEXT: v_mov_b32_e32 v55, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v33 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_mov_b32_e32 v57, v34 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_mov_b32_e32 v56, v35 ; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v36 ; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v45 +; SI-NEXT: v_mov_b32_e32 v45, v38 ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_mov_b32_e32 v53, v46 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v48, v44 +; SI-NEXT: v_mov_b32_e32 v44, v39 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v29, v41 +; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v40f16_to_v10i64_scalar: @@ -24060,206 +23680,147 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v10f64_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40i16: @@ -24647,218 +24208,165 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-LABEL: bitcast_v10f64_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v17, s18 -; SI-NEXT: v_mov_b32_e32 v18, s19 -; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v16, s21 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v10f64_to_v40i16_scalar: @@ -25435,132 +24943,136 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 ; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v41 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_or_b32_e32 v3, v3, v35 -; SI-NEXT: v_or_b32_e32 v4, v4, v40 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_or_b32_e32 v6, v6, v33 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_or_b32_e32 v9, v9, v62 ; SI-NEXT: v_or_b32_e32 v10, v10, v61 -; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -25570,81 +25082,91 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v59 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 -; SI-NEXT: v_or_b32_e32 v14, v14, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v56 -; SI-NEXT: v_or_b32_e32 v17, v17, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -25656,20 +25178,20 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v46, v5 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 ; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 ; SI-NEXT: v_or_b32_e32 v10, v61, v10 -; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -25680,62 +25202,64 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v11, v60, v11 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_or_b32_e32 v14, v57, v14 -; SI-NEXT: v_or_b32_e32 v15, v56, v15 -; SI-NEXT: v_or_b32_e32 v17, v46, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v19 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26267,209 +25791,221 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; SI-LABEL: bitcast_v40i16_to_v10f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v24 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: v_mov_b32_e32 v50, v4 -; SI-NEXT: v_mov_b32_e32 v51, v2 -; SI-NEXT: v_mov_b32_e32 v52, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v7, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v8, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v9, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v10, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -27011,29 +26547,27 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v10f64_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -27043,355 +26577,298 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v10f64_to_v40f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f64_to_v40f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 @@ -27759,338 +27236,278 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-LABEL: bitcast_v10f64_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_mov_b32_e32 v21, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 ; SI-NEXT: v_mov_b32_e32 v8, s26 ; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v6, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -28100,28 +27517,29 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: @@ -28698,128 +28116,146 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 @@ -28832,9 +28268,11 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 ; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_or_b32_e32 v1, v36, v1 ; SI-NEXT: v_or_b32_e32 v2, v34, v2 @@ -28846,7 +28284,11 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v8, v46, v8 ; SI-NEXT: v_or_b32_e32 v9, v44, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -28870,47 +28312,50 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v50, v18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 @@ -28918,9 +28363,6 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -28938,7 +28380,11 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28948,152 +28394,142 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -29101,25 +28537,25 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: bitcast_v40f16_to_v10f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -29649,6 +29085,16 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-LABEL: bitcast_v40f16_to_v10f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -29665,98 +29111,149 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 -; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v55, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v29, v16 -; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: v_or_b32_e32 v13, v48, v13 +; SI-NEXT: v_or_b32_e32 v14, v30, v14 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_or_b32_e32 v16, v26, v16 +; SI-NEXT: v_or_b32_e32 v17, v24, v17 ; SI-NEXT: v_or_b32_e32 v18, v22, v18 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -29769,10 +29266,11 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -29781,18 +29279,19 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 @@ -29800,10 +29299,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -29811,25 +29310,24 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -29838,20 +29336,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 @@ -29860,20 +29358,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -29882,7 +29380,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -29893,7 +29391,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -29934,87 +29432,86 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v44 +; SI-NEXT: v_mov_b32_e32 v44, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v38, v45 +; SI-NEXT: v_mov_b32_e32 v45, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v46, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v36, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_mov_b32_e32 v45, v52 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: v_mov_b32_e32 v46, v53 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v47, v54 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v55 -; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: v_mov_b32_e32 v56, v52 +; SI-NEXT: v_mov_b32_e32 v52, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v34, v57 +; SI-NEXT: v_mov_b32_e32 v57, v53 +; SI-NEXT: v_mov_b32_e32 v53, v25 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v32, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: v_mov_b32_e32 v33, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v55, v27 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: v_mov_b32_e32 v41, v29 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v25, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v32 -; SI-NEXT: v_mov_b32_e32 v24, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v55 +; SI-NEXT: v_mov_b32_e32 v55, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v33 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_mov_b32_e32 v57, v34 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_mov_b32_e32 v56, v35 ; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v36 ; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v45 +; SI-NEXT: v_mov_b32_e32 v45, v38 ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_mov_b32_e32 v53, v46 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v48, v44 +; SI-NEXT: v_mov_b32_e32 v44, v39 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v29, v41 +; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v40f16_to_v10f64_scalar: @@ -30521,153 +30018,159 @@ end: define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -30675,7 +30178,10 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -30686,355 +30192,290 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v60 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v49 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v40f16: @@ -31433,369 +30874,286 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-LABEL: bitcast_v40i16_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v42 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v42 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v41 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v51 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v35 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: @@ -32445,394 +31803,344 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v30 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v44 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v49 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_or_b32_e32 v50, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v54 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v49, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v36, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v34, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v32, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v5, v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_or_b32_e32 v12, v12, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_or_b32_e32 v35, v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v33, v33, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_or_b32_e32 v48, v30, v39 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 -; SI-NEXT: v_or_b32_e32 v39, v20, v30 -; SI-NEXT: v_or_b32_e32 v15, v15, v22 -; SI-NEXT: v_or_b32_e32 v13, v13, v21 -; SI-NEXT: v_or_b32_e32 v11, v11, v28 -; SI-NEXT: v_or_b32_e32 v6, v6, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_alignbit_b32 v40, v39, v23, 16 -; SI-NEXT: v_alignbit_b32 v55, v48, v24, 16 -; SI-NEXT: v_alignbit_b32 v54, v33, v25, 16 -; SI-NEXT: v_alignbit_b32 v53, v35, v26, 16 -; SI-NEXT: v_alignbit_b32 v52, v18, v27, 16 -; SI-NEXT: v_alignbit_b32 v51, v12, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, v14, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, v9, v28, 16 -; SI-NEXT: v_alignbit_b32 v20, v3, v19, 16 -; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 -; SI-NEXT: .LBB58_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v16, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v16, v16, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v16, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v27, v27, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v33 +; SI-NEXT: v_or_b32_e32 v24, v24, v50 +; SI-NEXT: v_or_b32_e32 v16, v16, v31 +; SI-NEXT: v_or_b32_e32 v21, v21, v51 +; SI-NEXT: v_alignbit_b32 v48, v2, v20, 16 +; SI-NEXT: v_alignbit_b32 v39, v30, v39, 16 +; SI-NEXT: v_alignbit_b32 v38, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v37, v28, v36, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v49, 16 +; SI-NEXT: v_alignbit_b32 v35, v12, v34, 16 +; SI-NEXT: v_alignbit_b32 v34, v25, v33, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v50, 16 +; SI-NEXT: v_alignbit_b32 v32, v18, v31, 16 +; SI-NEXT: v_alignbit_b32 v31, v22, v51, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v20 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v20 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v40i16: @@ -33232,6 +32540,31 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -33244,343 +32577,316 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v13 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v45, v22, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_or_b32_e32 v54, v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v57, v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v58, v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v43, v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_or_b32_e32 v53, v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v47, v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v40, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v41, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v54, v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_or_b32_e32 v44, v21, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_or_b32_e32 v51, v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v43, v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_or_b32_e32 v8, v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v58, v16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 +; SI-NEXT: v_or_b32_e32 v46, v21, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v55 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v24 +; SI-NEXT: v_or_b32_e32 v41, v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v44 -; SI-NEXT: v_or_b32_e32 v14, v14, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v55, v21, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v59, v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v12, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 -; SI-NEXT: v_or_b32_e32 v50, v23, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v42, v20, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v42 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33593,7 +32899,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 3d9c7681b3132..8a0d00ea6164f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -977,25 +977,29 @@ define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) { ; SI-LABEL: bitcast_i64_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4i16: @@ -1074,10 +1078,14 @@ define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -1157,10 +1165,13 @@ define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1172,23 +1183,23 @@ define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1262,24 +1273,26 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v4i16_to_i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -1373,43 +1386,45 @@ define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { ; SI-LABEL: bitcast_i64_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4f16: @@ -1480,28 +1495,36 @@ define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 ; SI-NEXT: s_addc_u32 s5, s17, 0 ; SI-NEXT: s_lshr_b32 s6, s4, 16 ; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_i64_to_v4f16_scalar: @@ -1577,11 +1600,17 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1594,32 +1623,32 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,11 +1721,17 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v4f16_to_i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -1812,39 +1847,39 @@ define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) { ; SI-LABEL: bitcast_i64_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB20_4 -; SI-NEXT: .LBB20_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB20_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB20_2 -; SI-NEXT: .LBB20_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4bf16: @@ -1914,29 +1949,33 @@ define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_and_b32 s9, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 ; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s5, 16 -; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s4, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_i64_to_v4bf16_scalar: @@ -2012,11 +2051,15 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2029,13 +2072,13 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true @@ -2044,9 +2087,9 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -2271,11 +2314,15 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI-LABEL: bitcast_v4bf16_to_i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 @@ -4189,26 +4236,28 @@ define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { ; SI-LABEL: bitcast_f64_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4i16: @@ -4279,22 +4328,26 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: .LBB37_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4i16_scalar: @@ -4369,10 +4422,13 @@ define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4384,23 +4440,23 @@ define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB38_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: .LBB38_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4474,24 +4530,26 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; SI-LABEL: bitcast_v4i16_to_f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -4614,8 +4672,14 @@ define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16: @@ -4683,27 +4747,35 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_f64_to_v4f16_scalar: @@ -4778,11 +4850,17 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4795,32 +4873,32 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4893,11 +4971,17 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; SI-LABEL: bitcast_v4f16_to_f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -5014,8 +5098,8 @@ define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5023,8 +5107,8 @@ define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5032,12 +5116,18 @@ define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, v5 -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4bf16: @@ -5115,7 +5205,7 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB45_5 ; SI-NEXT: .LBB45_3: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 @@ -5127,6 +5217,15 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4bf16_scalar: @@ -5201,11 +5300,15 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5218,13 +5321,13 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true @@ -5233,9 +5336,9 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5460,11 +5563,15 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI-LABEL: bitcast_v4bf16_to_f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 @@ -7071,25 +7178,29 @@ define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v2i32_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4i16: @@ -7167,10 +7278,14 @@ define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -7250,10 +7365,13 @@ define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7265,23 +7383,23 @@ define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB58_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: .LBB58_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -7355,24 +7473,26 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v4i16_to_v2i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -7466,43 +7586,45 @@ define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v2i32_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB60_4 -; SI-NEXT: .LBB60_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB60_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 -; SI-NEXT: .LBB60_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4f16: @@ -7572,28 +7694,36 @@ define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 i ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s16, 3 ; SI-NEXT: s_add_i32 s6, s17, 3 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: @@ -7669,11 +7799,17 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7686,32 +7822,32 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7784,11 +7920,17 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -7904,39 +8046,39 @@ define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v2i32_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v5 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4bf16: @@ -8005,29 +8147,33 @@ define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_and_b32 s9, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s16, 3 ; SI-NEXT: s_add_i32 s5, s17, 3 -; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s5, 16 -; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s4, 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v2i32_to_v4bf16_scalar: @@ -8103,11 +8249,15 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8120,13 +8270,13 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true @@ -8135,9 +8285,9 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -8362,11 +8512,15 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 @@ -9640,25 +9794,29 @@ define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v2f32_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4i16: @@ -9730,23 +9888,27 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v5, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s16, 1.0 -; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4i16_scalar: @@ -9824,10 +9986,13 @@ define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9839,23 +10004,23 @@ define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB74_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: .LBB74_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -9929,24 +10094,26 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v4i16_to_v2f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_cbranch_execnz .LBB75_3 ; SI-NEXT: .LBB75_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -10040,43 +10207,45 @@ define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v2f32_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB76_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB76_4 -; SI-NEXT: .LBB76_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB76_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 -; SI-NEXT: .LBB76_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16: @@ -10145,28 +10314,36 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: @@ -10244,11 +10421,17 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10261,32 +10444,32 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10359,11 +10542,17 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; SI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -10479,39 +10668,39 @@ define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v2f32_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB80_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB80_4 -; SI-NEXT: .LBB80_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB80_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB80_2 -; SI-NEXT: .LBB80_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v5 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4bf16: @@ -10591,7 +10780,7 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB81_5 ; SI-NEXT: .LBB81_3: ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 @@ -10603,6 +10792,15 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: .LBB81_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4bf16_scalar: @@ -10680,11 +10878,15 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10697,13 +10899,13 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true @@ -10712,9 +10914,9 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10939,11 +11141,15 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 @@ -12233,45 +12439,47 @@ define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v5, v2 -; SI-NEXT: v_mov_b32_e32 v6, v1 -; SI-NEXT: v_mov_b32_e32 v7, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB88_4 -; SI-NEXT: .LBB88_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB88_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: .LBB88_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4f16: @@ -12342,29 +12550,39 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v4i16_to_v4f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB89_2 ; @@ -12450,35 +12668,47 @@ define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_alignbit_b32 v4, v1, v2, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4i16: @@ -12550,34 +12780,45 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 ; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -12667,43 +12908,46 @@ define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB92_4 -; SI-NEXT: .LBB92_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB92_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_2 -; SI-NEXT: .LBB92_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4bf16: @@ -12774,39 +13018,45 @@ define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s6, s16, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s9, s18, 16 -; SI-NEXT: s_lshl_b32 s8, s19, 16 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 16 +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s11, 16 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s16, 0xffff -; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_lshl_b32 s6, s10, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 ; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s4, 16 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB93_2 ; @@ -12892,51 +13142,56 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB94_4 -; SI-NEXT: .LBB94_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB94_2 -; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 +; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4i16: @@ -13152,42 +13407,50 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v5 -; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: @@ -13425,19 +13688,19 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 ; SI-NEXT: v_mov_b32_e32 v9, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13451,18 +13714,17 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; SI-NEXT: .LBB96_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v4, v1, v11 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v4, v1, v10 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; SI-NEXT: v_bfe_u32 v7, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: .LBB96_4: ; %cmp.true @@ -13470,8 +13732,8 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 @@ -13663,30 +13925,31 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-LABEL: bitcast_v4i16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s15, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s9, s5, 8 -; SI-NEXT: s_and_b32 s11, s19, 0xffff -; SI-NEXT: s_bfe_u32 s7, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s7, s14, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -13694,7 +13957,7 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s7, s5, 24 -; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s14, s5, 16 ; SI-NEXT: s_lshr_b32 s9, s5, 8 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -13703,7 +13966,7 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: @@ -13712,7 +13975,6 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB97_2 ; @@ -13877,79 +14139,79 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v8i8_to_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB98_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB98_4 -; SI-NEXT: .LBB98_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB98_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v7, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v3, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB98_2 -; SI-NEXT: .LBB98_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4i16: @@ -14290,10 +14552,14 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 ; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -14503,53 +14769,62 @@ define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB100_4 -; SI-NEXT: .LBB100_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB100_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: .LBB100_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4bf16: @@ -14621,42 +14896,56 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB101_3 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: @@ -14744,57 +15033,65 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB102_4 -; SI-NEXT: .LBB102_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB102_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 -; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4f16: @@ -15017,46 +15314,58 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: @@ -15308,11 +15617,17 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -15547,11 +15862,17 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s18 -; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 @@ -15761,71 +16082,73 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v8i8_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v9, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v7 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB106_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB106_4 -; SI-NEXT: .LBB106_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB106_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB106_2 -; SI-NEXT: .LBB106_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4f16: @@ -16121,11 +16444,11 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -16153,15 +16476,23 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB107_2 ; @@ -16367,11 +16698,15 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v4bf16_to_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -16743,11 +17078,15 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 @@ -17141,75 +17480,80 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v8i8_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v1 -; SI-NEXT: v_mov_b32_e32 v9, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v7 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v3, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v8, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v10, v9, v0 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4bf16: @@ -17505,11 +17849,11 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_or_b32 s8, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 @@ -17542,20 +17886,24 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_and_b32 s8, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s5, 16 ; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s4, 16 ; SI-NEXT: .LBB111_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB111_4: ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 4743a9a84d243..6e2167edd97cd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -2330,236 +2330,171 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v22i32_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v44i16: @@ -3021,44 +2956,44 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v22i32_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v10 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v11 -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v12 -; SI-NEXT: v_mov_b32_e32 v12, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v13 -; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s20, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v15 -; SI-NEXT: v_readfirstlane_b32 s18, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v18 -; SI-NEXT: v_readfirstlane_b32 s17, v19 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s13, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v10 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v11 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s76, s5, 16 @@ -3133,153 +3068,91 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v1, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s94, 16 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v3, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s93, 16 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s92, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s91, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s89, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s79, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s77, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s27, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr72 @@ -3903,159 +3776,145 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v17 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v13 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v45 -; SI-NEXT: v_or_b32_e32 v2, v2, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v43 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -4063,61 +3922,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v33 -; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_or_b32_e32 v12, v12, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v62 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 -; SI-NEXT: v_or_b32_e32 v17, v17, v58 -; SI-NEXT: v_or_b32_e32 v18, v18, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v56 -; SI-NEXT: v_or_b32_e32 v20, v20, v47 -; SI-NEXT: v_or_b32_e32 v21, v21, v46 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -4127,33 +3934,97 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -4164,17 +4035,21 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v43, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_or_b32_e32 v8, v57, v8 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -4185,49 +4060,38 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v12, v63, v12 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v60, v15 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 -; SI-NEXT: v_or_b32_e32 v17, v58, v17 -; SI-NEXT: v_or_b32_e32 v18, v57, v18 -; SI-NEXT: v_or_b32_e32 v19, v56, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -4238,26 +4102,37 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4849,233 +4724,239 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v44i16_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_mov_b32_e32 v51, v6 -; SI-NEXT: v_mov_b32_e32 v52, v4 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v9, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v10, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v11, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v20, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 -; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v54 +; SI-NEXT: v_or_b32_e32 v21, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -5658,10 +5539,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v22i32_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -5678,15 +5558,14 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -5696,109 +5575,109 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -5820,11 +5699,11 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -5847,236 +5726,145 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -6093,7 +5881,30 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v44f16: @@ -6555,119 +6366,119 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-LABEL: bitcast_v22i32_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s23, v10 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_readfirstlane_b32 s24, v11 -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_readfirstlane_b32 s25, v12 -; SI-NEXT: v_mov_b32_e32 v12, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v13 -; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s27, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v19 -; SI-NEXT: v_readfirstlane_b32 s17, v10 -; SI-NEXT: v_readfirstlane_b32 s16, v11 -; SI-NEXT: v_readfirstlane_b32 s15, v12 -; SI-NEXT: v_readfirstlane_b32 s14, v13 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v10 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_readfirstlane_b32 s27, v12 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v0 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v7 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -6685,11 +6496,11 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: s_lshr_b32 s28, s25, 16 -; SI-NEXT: s_lshr_b32 s29, s26, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s5, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s26, 16 +; SI-NEXT: s_lshr_b32 s29, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 ; SI-NEXT: s_lshr_b32 s41, s22, 16 ; SI-NEXT: s_lshr_b32 s42, s21, 16 ; SI-NEXT: s_lshr_b32 s43, s20, 16 @@ -6707,250 +6518,185 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s63, s7, 16 ; SI-NEXT: s_lshr_b32 s72, s6, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_or_b32_e32 v17, v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v19, v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v14, v29, v14 +; SI-NEXT: v_or_b32_e32 v16, v27, v16 +; SI-NEXT: v_or_b32_e32 v18, v25, v18 +; SI-NEXT: v_or_b32_e32 v20, v23, v20 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: @@ -7550,159 +7296,163 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 @@ -7713,10 +7463,13 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v2, v38, v2 @@ -7726,8 +7479,13 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v40, v13 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -7747,78 +7505,86 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v40, v19 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: .LBB18_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -7868,130 +7634,107 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -8003,17 +7746,29 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -8021,22 +7776,22 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8629,6 +8384,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -8645,219 +8408,280 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 -; SI-NEXT: v_or_b32_e32 v11, v59, v11 -; SI-NEXT: v_or_b32_e32 v12, v57, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v55, v17 -; SI-NEXT: v_or_b32_e32 v18, v53, v18 -; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_or_b32_e32 v18, v28, v18 +; SI-NEXT: v_or_b32_e32 v19, v26, v19 ; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -8877,43 +8701,43 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -8949,88 +8773,92 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v50, v32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v43 -; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v39, v56 +; SI-NEXT: v_mov_b32_e32 v56, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v38, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v40 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v41 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_mov_b32_e32 v43, v29 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v44 +; SI-NEXT: v_mov_b32_e32 v44, v30 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v45 +; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v31, v45 +; SI-NEXT: v_mov_b32_e32 v45, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: v_mov_b32_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v43, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v27, v41 +; SI-NEXT: v_mov_b32_e32 v41, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v26, v40 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 ; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v55, v57 +; SI-NEXT: v_mov_b32_e32 v57, v38 ; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v43 -; SI-NEXT: v_mov_b32_e32 v43, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v54, v56 +; SI-NEXT: v_mov_b32_e32 v56, v39 ; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 -; SI-NEXT: v_mov_b32_e32 v23, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: @@ -11069,236 +10897,171 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v22f32_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44i16: @@ -11738,248 +11501,191 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-LABEL: bitcast_v22f32_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s17 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_mov_b32_e32 v14, s25 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v33 -; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v22f32_to_v44i16_scalar: @@ -12628,118 +12334,204 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v17 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v13 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -12759,126 +12551,38 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v45 -; SI-NEXT: v_or_b32_e32 v2, v2, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v43 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v33 -; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_or_b32_e32 v12, v12, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v62 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 -; SI-NEXT: v_or_b32_e32 v17, v17, v58 -; SI-NEXT: v_or_b32_e32 v18, v18, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v56 -; SI-NEXT: v_or_b32_e32 v20, v20, v47 -; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -12889,17 +12593,21 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v43, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_or_b32_e32 v8, v57, v8 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -12910,49 +12618,38 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v12, v63, v12 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v60, v15 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 -; SI-NEXT: v_or_b32_e32 v17, v58, v17 -; SI-NEXT: v_or_b32_e32 v18, v57, v18 -; SI-NEXT: v_or_b32_e32 v19, v56, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -12963,26 +12660,37 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13574,233 +13282,239 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; SI-LABEL: bitcast_v44i16_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_mov_b32_e32 v51, v6 -; SI-NEXT: v_mov_b32_e32 v52, v4 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v9, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v10, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v11, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v20, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 -; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v54 +; SI-NEXT: v_or_b32_e32 v21, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -14383,10 +14097,9 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v22f32_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -14403,15 +14116,14 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -14421,109 +14133,109 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -14545,11 +14257,11 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -14572,236 +14284,145 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -14818,7 +14439,30 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44f16: @@ -15258,7 +14902,7 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-LABEL: bitcast_v22f32_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -15275,412 +14919,248 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s18 ; SI-NEXT: v_mov_b32_e32 v62, s19 -; SI-NEXT: v_mov_b32_e32 v61, s20 -; SI-NEXT: v_mov_b32_e32 v58, s21 -; SI-NEXT: v_mov_b32_e32 v56, s22 -; SI-NEXT: v_mov_b32_e32 v46, s23 -; SI-NEXT: v_mov_b32_e32 v45, s24 +; SI-NEXT: v_mov_b32_e32 v60, s20 +; SI-NEXT: v_mov_b32_e32 v59, s21 +; SI-NEXT: v_mov_b32_e32 v58, s22 +; SI-NEXT: v_mov_b32_e32 v47, s23 +; SI-NEXT: v_mov_b32_e32 v44, s24 ; SI-NEXT: v_mov_b32_e32 v43, s25 -; SI-NEXT: v_mov_b32_e32 v59, s26 +; SI-NEXT: v_mov_b32_e32 v42, s26 +; SI-NEXT: v_mov_b32_e32 v56, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v60, s27 ; SI-NEXT: v_mov_b32_e32 v57, s28 -; SI-NEXT: v_mov_b32_e32 v47, s29 -; SI-NEXT: s_cbranch_scc0 .LBB33_2 +; SI-NEXT: v_mov_b32_e32 v45, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v46 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v62 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v59 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v12 -; SI-NEXT: s_branch .LBB33_3 -; SI-NEXT: .LBB33_2: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: .LBB33_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v14 -; SI-NEXT: v_mov_b32_e32 v14, v16 -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: s_cbranch_vccnz .LBB33_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v9, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v61 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v21 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v60 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v43 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v57 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v45 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v12 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: .LBB33_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v31 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v20, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v20, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -15697,8 +15177,88 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_or_b32_e32 v17, v26, v17 +; SI-NEXT: v_or_b32_e32 v19, v24, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: ; VI: ; %bb.0: @@ -16346,173 +15906,180 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v2, v38, v2 @@ -16522,8 +16089,13 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v40, v13 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -16543,64 +16115,72 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v40, v19 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 @@ -16664,130 +16244,107 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -16799,17 +16356,29 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -16817,22 +16386,22 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17425,6 +16994,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-LABEL: bitcast_v44f16_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -17441,219 +17018,280 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 -; SI-NEXT: v_or_b32_e32 v11, v59, v11 -; SI-NEXT: v_or_b32_e32 v12, v57, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v55, v17 -; SI-NEXT: v_or_b32_e32 v18, v53, v18 -; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_or_b32_e32 v18, v28, v18 +; SI-NEXT: v_or_b32_e32 v19, v26, v19 ; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -17673,43 +17311,43 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -17745,88 +17383,92 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v50, v32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v43 -; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v39, v56 +; SI-NEXT: v_mov_b32_e32 v56, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v38, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v40 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v41 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_mov_b32_e32 v43, v29 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v44 +; SI-NEXT: v_mov_b32_e32 v44, v30 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v45 +; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v31, v45 +; SI-NEXT: v_mov_b32_e32 v45, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: v_mov_b32_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v43, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v27, v41 +; SI-NEXT: v_mov_b32_e32 v41, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v26, v40 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 ; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v55, v57 +; SI-NEXT: v_mov_b32_e32 v57, v38 ; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v43 -; SI-NEXT: v_mov_b32_e32 v43, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v54, v56 +; SI-NEXT: v_mov_b32_e32 v56, v39 ; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 -; SI-NEXT: v_mov_b32_e32 v23, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: @@ -19103,236 +18745,171 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v11i64_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44i16: @@ -19806,44 +19383,44 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v11i64_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v10 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v11 -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v12 -; SI-NEXT: v_mov_b32_e32 v12, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v13 -; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s20, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v15 -; SI-NEXT: v_readfirstlane_b32 s18, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v18 -; SI-NEXT: v_readfirstlane_b32 s17, v19 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s13, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v10 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v11 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s76, s5, 16 @@ -19918,153 +19495,91 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v1, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s94, 16 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v3, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s93, 16 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s92, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s91, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s89, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s79, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s40, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s77, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s27, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr72 @@ -20688,118 +20203,204 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v17 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v13 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -20819,126 +20420,38 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v45 -; SI-NEXT: v_or_b32_e32 v2, v2, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v43 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v33 -; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_or_b32_e32 v12, v12, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v62 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 -; SI-NEXT: v_or_b32_e32 v17, v17, v58 -; SI-NEXT: v_or_b32_e32 v18, v18, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v56 -; SI-NEXT: v_or_b32_e32 v20, v20, v47 -; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -20949,17 +20462,21 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v43, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_or_b32_e32 v8, v57, v8 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -20970,49 +20487,38 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v12, v63, v12 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v60, v15 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 -; SI-NEXT: v_or_b32_e32 v17, v58, v17 -; SI-NEXT: v_or_b32_e32 v18, v57, v18 -; SI-NEXT: v_or_b32_e32 v19, v56, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -21023,26 +20529,37 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21634,233 +21151,239 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v44i16_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_mov_b32_e32 v51, v6 -; SI-NEXT: v_mov_b32_e32 v52, v4 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v9, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v10, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v11, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v20, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 -; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v54 +; SI-NEXT: v_or_b32_e32 v21, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -22443,10 +21966,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v11i64_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -22463,15 +21985,14 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -22481,109 +22002,109 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -22605,263 +22126,172 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -22878,7 +22308,30 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44f16: @@ -23352,126 +22805,126 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-LABEL: bitcast_v11i64_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s22, v10 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v11 -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_readfirstlane_b32 s23, v12 -; SI-NEXT: v_mov_b32_e32 v12, s28 -; SI-NEXT: v_readfirstlane_b32 s27, v13 -; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v15 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v10 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_readfirstlane_b32 s14, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v13 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_readfirstlane_b32 s26, v10 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_readfirstlane_b32 s27, v12 +; SI-NEXT: v_readfirstlane_b32 s22, v13 +; SI-NEXT: v_readfirstlane_b32 s23, v14 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v19 +; SI-NEXT: v_readfirstlane_b32 s17, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v7 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 -; SI-NEXT: s_cbranch_execnz .LBB45_3 -; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s22, 3 -; SI-NEXT: s_addc_u32 s5, s26, 0 -; SI-NEXT: s_lshr_b32 s22, s4, 16 -; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s28, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s24, 3 +; SI-NEXT: s_addc_u32 s5, s26, 0 +; SI-NEXT: s_lshr_b32 s24, s4, 16 +; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: s_add_u32 s25, s25, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s28, s25, 16 ; SI-NEXT: s_lshr_b32 s29, s27, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_lshr_b32 s40, s24, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s42, s20, 16 @@ -23504,250 +22957,185 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s72, s6, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s24 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_or_b32_e32 v17, v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v19, v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v39, v39, v48 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v14, v29, v14 +; SI-NEXT: v_or_b32_e32 v16, v27, v16 +; SI-NEXT: v_or_b32_e32 v18, v25, v18 +; SI-NEXT: v_or_b32_e32 v20, v23, v20 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: @@ -24347,159 +23735,163 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 @@ -24510,10 +23902,13 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v2, v38, v2 @@ -24523,8 +23918,13 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v40, v13 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -24544,64 +23944,72 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v40, v19 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 @@ -24665,130 +24073,107 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -24800,17 +24185,29 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -24818,22 +24215,22 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25426,6 +24823,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25442,219 +24847,280 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 -; SI-NEXT: v_or_b32_e32 v11, v59, v11 -; SI-NEXT: v_or_b32_e32 v12, v57, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v55, v17 -; SI-NEXT: v_or_b32_e32 v18, v53, v18 -; SI-NEXT: v_or_b32_e32 v19, v31, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_or_b32_e32 v18, v28, v18 +; SI-NEXT: v_or_b32_e32 v19, v26, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -25674,43 +25140,43 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -25746,88 +25212,92 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v50, v32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v43 -; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v39, v56 +; SI-NEXT: v_mov_b32_e32 v56, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v38, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v40 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v41 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_mov_b32_e32 v43, v29 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v44 +; SI-NEXT: v_mov_b32_e32 v44, v30 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v45 +; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v31, v45 +; SI-NEXT: v_mov_b32_e32 v45, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: v_mov_b32_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v43, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v27, v41 +; SI-NEXT: v_mov_b32_e32 v41, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v26, v40 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 ; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v55, v57 +; SI-NEXT: v_mov_b32_e32 v57, v38 ; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v43 -; SI-NEXT: v_mov_b32_e32 v43, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v54, v56 +; SI-NEXT: v_mov_b32_e32 v56, v39 ; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 -; SI-NEXT: v_mov_b32_e32 v23, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: @@ -26374,225 +25844,160 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v11f64_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44i16: @@ -27010,237 +26415,186 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-LABEL: bitcast_v11f64_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s17 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_mov_b32_e32 v14, s25 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 -; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v30 -; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v53 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v11f64_to_v44i16_scalar: @@ -27866,118 +27220,204 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v17 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v13 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v11, v11, v46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -27997,126 +27437,38 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v45 -; SI-NEXT: v_or_b32_e32 v2, v2, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v43 -; SI-NEXT: v_or_b32_e32 v4, v4, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v33 -; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_or_b32_e32 v12, v12, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v62 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 -; SI-NEXT: v_or_b32_e32 v17, v17, v58 -; SI-NEXT: v_or_b32_e32 v18, v18, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v56 -; SI-NEXT: v_or_b32_e32 v20, v20, v47 -; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -28127,17 +27479,21 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v43, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_or_b32_e32 v8, v57, v8 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -28148,49 +27504,38 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v12, v63, v12 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v60, v15 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 -; SI-NEXT: v_or_b32_e32 v17, v58, v17 -; SI-NEXT: v_or_b32_e32 v18, v57, v18 -; SI-NEXT: v_or_b32_e32 v19, v56, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -28201,26 +27546,37 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v21 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28812,233 +28168,239 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; SI-LABEL: bitcast_v44i16_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_mov_b32_e32 v51, v6 -; SI-NEXT: v_mov_b32_e32 v52, v4 -; SI-NEXT: v_mov_b32_e32 v53, v2 -; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v9, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v10, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v11, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v20, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 -; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v54 +; SI-NEXT: v_or_b32_e32 v21, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -29621,10 +28983,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v11f64_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -29641,15 +29002,14 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -29659,365 +29019,274 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30034,7 +29303,30 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44f16: @@ -30452,22 +29744,22 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-LABEL: bitcast_v11f64_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v22, s17 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v13, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30486,324 +29778,233 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30820,56 +30021,78 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: @@ -31495,159 +30718,163 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB54_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 @@ -31658,10 +30885,13 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 ; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v2, v38, v2 @@ -31671,8 +30901,13 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_or_b32_e32 v9, v56, v9 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v40, v13 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -31692,64 +30927,72 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v40, v19 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 @@ -31813,130 +31056,107 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -31948,17 +31168,29 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 @@ -31966,22 +31198,22 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32574,6 +31806,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-LABEL: bitcast_v44f16_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -32590,219 +31830,280 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB55_4 -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 -; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 -; SI-NEXT: v_or_b32_e32 v11, v59, v11 -; SI-NEXT: v_or_b32_e32 v12, v57, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v55, v17 -; SI-NEXT: v_or_b32_e32 v18, v53, v18 -; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_or_b32_e32 v18, v28, v18 +; SI-NEXT: v_or_b32_e32 v19, v26, v19 ; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -32822,43 +32123,43 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -32894,88 +32195,92 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v50, v32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v43 -; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v39, v56 +; SI-NEXT: v_mov_b32_e32 v56, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v38, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v40 -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v40 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v41 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_mov_b32_e32 v43, v29 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v44 +; SI-NEXT: v_mov_b32_e32 v44, v30 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v45 +; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v31, v45 +; SI-NEXT: v_mov_b32_e32 v45, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: v_mov_b32_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v43, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v27, v41 +; SI-NEXT: v_mov_b32_e32 v41, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v26, v40 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 ; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v55, v57 +; SI-NEXT: v_mov_b32_e32 v57, v38 ; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v43 -; SI-NEXT: v_mov_b32_e32 v43, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v54, v56 +; SI-NEXT: v_mov_b32_e32 v56, v39 ; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 -; SI-NEXT: v_mov_b32_e32 v23, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: @@ -33522,616 +32827,548 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v44f16: @@ -34562,455 +33799,328 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-LABEL: bitcast_v44i16_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v58 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v58 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v57 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v45 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v36 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: @@ -35716,428 +34826,377 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v46 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v21, v21, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v19, v19, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v17, v17, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v12, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v31, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_or_b32_e32 v34, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v37, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v48, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v51, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 -; SI-NEXT: v_or_b32_e32 v50, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v15, v15, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v39, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v36, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v33, v33, v30 -; SI-NEXT: v_or_b32_e32 v21, v21, v52 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_or_b32_e32 v11, v11, v18 -; SI-NEXT: v_alignbit_b32 v41, v48, v26, 16 -; SI-NEXT: v_alignbit_b32 v40, v37, v27, 16 -; SI-NEXT: v_alignbit_b32 v55, v34, v28, 16 -; SI-NEXT: v_alignbit_b32 v54, v31, v29, 16 -; SI-NEXT: v_alignbit_b32 v53, v19, v30, 16 -; SI-NEXT: v_alignbit_b32 v52, v15, v52, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 -; SI-NEXT: v_alignbit_b32 v22, v9, v22, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v33, v33, v50 +; SI-NEXT: v_or_b32_e32 v32, v32, v49 +; SI-NEXT: v_or_b32_e32 v31, v31, v48 +; SI-NEXT: v_or_b32_e32 v30, v30, v39 +; SI-NEXT: v_or_b32_e32 v28, v28, v38 +; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v26, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_alignbit_b32 v51, v1, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v50, 16 +; SI-NEXT: v_alignbit_b32 v49, v5, v49, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v39, v9, v39, 16 +; SI-NEXT: v_alignbit_b32 v38, v11, v38, 16 +; SI-NEXT: v_alignbit_b32 v37, v13, v37, 16 +; SI-NEXT: v_alignbit_b32 v36, v15, v36, 16 +; SI-NEXT: v_alignbit_b32 v35, v17, v35, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v34, 16 +; SI-NEXT: v_alignbit_b32 v29, v21, v29, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_or_b32_e32 v20, v20, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v44i16: @@ -36569,414 +35628,413 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v2 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v24, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_or_b32_e32 v8, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v24 +; SI-NEXT: v_or_b32_e32 v44, v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v54, v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v60, v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_or_b32_e32 v27, v22, v4 +; SI-NEXT: v_or_b32_e32 v56, v24, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v63 +; SI-NEXT: v_or_b32_e32 v26, v23, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v29 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v40, v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v58, v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_or_b32_e32 v46, v19, v9 -; SI-NEXT: v_or_b32_e32 v62, v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v43, v22, v10 +; SI-NEXT: v_or_b32_e32 v63, v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v60, v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v56, v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v45, v12, v7 -; SI-NEXT: v_or_b32_e32 v12, v19, v3 -; SI-NEXT: v_or_b32_e32 v43, v11, v5 -; SI-NEXT: v_or_b32_e32 v11, v20, v1 -; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v33, v11 -; SI-NEXT: v_lshr_b64 v[30:31], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v62, v22, v16 +; SI-NEXT: v_or_b32_e32 v58, v24, v18 +; SI-NEXT: v_or_b32_e32 v22, v25, v20 +; SI-NEXT: v_lshr_b64 v[28:29], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_or_b32_e32 v54, v23, v14 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_lshr_b64 v[60:61], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v25, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[20:21], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v58 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 9569f4e38116f..1ff6bbd4e9a37 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -2440,260 +2440,185 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v24i32_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v48i16: @@ -3190,53 +3115,53 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s40, v12 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v13 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v15 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_writelane_b32 v24, s30, 0 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v12 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s88, s5, 16 @@ -3317,175 +3242,107 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s76, 16 ; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s35, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s74, 16 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s34, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s72, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s95, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s93, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s91, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s40, s42, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s89, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr76 @@ -4156,285 +4013,281 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v39 -; SI-NEXT: v_or_b32_e32 v2, v2, v46 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v5, v5, v37 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v43 -; SI-NEXT: v_or_b32_e32 v9, v9, v35 -; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v33 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_or_b32_e32 v16, v16, v63 -; SI-NEXT: v_or_b32_e32 v17, v17, v62 -; SI-NEXT: v_or_b32_e32 v18, v18, v61 -; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_or_b32_e32 v20, v20, v59 -; SI-NEXT: v_or_b32_e32 v21, v21, v58 -; SI-NEXT: v_or_b32_e32 v22, v22, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -4443,15 +4296,23 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v45, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v44, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -4460,64 +4321,40 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v41, v12 -; SI-NEXT: v_or_b32_e32 v13, v33, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v16, v63, v16 -; SI-NEXT: v_or_b32_e32 v17, v62, v17 -; SI-NEXT: v_or_b32_e32 v18, v61, v18 -; SI-NEXT: v_or_b32_e32 v19, v60, v19 -; SI-NEXT: v_or_b32_e32 v20, v59, v20 -; SI-NEXT: v_or_b32_e32 v21, v58, v21 -; SI-NEXT: v_or_b32_e32 v22, v57, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -4528,28 +4365,51 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5200,319 +5060,270 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v48i16_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v31, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v9, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 -; SI-NEXT: v_or_b32_e32 v23, v0, v25 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v39 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 -; SI-NEXT: v_mov_b32_e32 v52, v48 -; SI-NEXT: v_mov_b32_e32 v48, v35 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v37, v24 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v56, v40 -; SI-NEXT: v_mov_b32_e32 v40, v38 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 -; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v34 -; SI-NEXT: v_mov_b32_e32 v34, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 -; SI-NEXT: v_mov_b32_e32 v31, v38 -; SI-NEXT: v_mov_b32_e32 v38, v40 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v24, v37 -; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: @@ -6141,16 +5952,12 @@ end: define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v24i32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6167,18 +5974,15 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -6189,119 +5993,126 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -6325,11 +6136,17 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -6349,37 +6166,32 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -6388,242 +6200,126 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -6640,10 +6336,52 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v24i32_to_v48f16: +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v24i32_to_v48f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 @@ -7136,131 +6874,130 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s23, v12 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v13 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v14 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v15 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s29, v18 -; SI-NEXT: v_readfirstlane_b32 s22, v19 -; SI-NEXT: v_readfirstlane_b32 s21, v12 -; SI-NEXT: v_readfirstlane_b32 s20, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s18, v15 -; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_readfirstlane_b32 s16, v17 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v13 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_readfirstlane_b32 s29, v16 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s18, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s16, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v0 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v9 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -7278,13 +7015,13 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: s_lshr_b32 s40, s25, 16 -; SI-NEXT: s_lshr_b32 s41, s26, 16 -; SI-NEXT: s_lshr_b32 s42, s27, 16 -; SI-NEXT: s_lshr_b32 s43, s28, 16 -; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_lshr_b32 s5, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 ; SI-NEXT: s_lshr_b32 s45, s22, 16 ; SI-NEXT: s_lshr_b32 s46, s21, 16 ; SI-NEXT: s_lshr_b32 s47, s20, 16 @@ -7302,275 +7039,204 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s75, s7, 16 ; SI-NEXT: s_lshr_b32 s76, s6, 16 ; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 ; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v18, v29, v18 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: v_or_b32_e32 v22, v25, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: @@ -8215,170 +7881,181 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -8414,9 +8091,14 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: v_or_b32_e32 v0, v54, v0 @@ -8426,9 +8108,14 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v38, v4 ; SI-NEXT: v_or_b32_e32 v5, v36, v5 ; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v21, v44, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: v_or_b32_e32 v10, v58, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -8444,86 +8131,71 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v20, v46, v20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 @@ -8556,158 +8228,128 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -8719,12 +8361,12 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -8734,7 +8376,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -8746,17 +8388,29 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -8764,22 +8418,22 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9431,165 +9085,281 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v55, v7 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v54, v7 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 ; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 ; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 ; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v59, v15 -; SI-NEXT: v_or_b32_e32 v16, v57, v16 -; SI-NEXT: v_or_b32_e32 v17, v47, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v25, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 -; SI-NEXT: v_or_b32_e32 v22, v26, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v17, v29, v17 +; SI-NEXT: v_or_b32_e32 v18, v27, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9600,9 +9370,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -9610,99 +9380,50 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -9712,22 +9433,30 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -9735,159 +9464,157 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v59 -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v51, v35 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v53, v43 -; SI-NEXT: v_mov_b32_e32 v52, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v25 -; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v30 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v27, v43 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v43, v53 -; SI-NEXT: v_mov_b32_e32 v52, v55 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v28, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v24, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v62 ; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v30, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v29, v60 ; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v32, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v28, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v33, v49 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v35, v51 +; SI-NEXT: v_mov_b32_e32 v36, v52 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v39, v55 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v48f16_to_v24i32_scalar: @@ -12042,260 +11769,185 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v24f32_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48i16: @@ -12767,280 +12419,229 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-LABEL: bitcast_v24f32_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_mov_b32_e32 v23, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v17, s22 -; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_mov_b32_e32 v15, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v12, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v24f32_to_v48i16_scalar: @@ -13739,129 +13340,222 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -13889,152 +13583,63 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v39 -; SI-NEXT: v_or_b32_e32 v2, v2, v46 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v5, v5, v37 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v43 -; SI-NEXT: v_or_b32_e32 v9, v9, v35 -; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v33 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_or_b32_e32 v16, v16, v63 -; SI-NEXT: v_or_b32_e32 v17, v17, v62 -; SI-NEXT: v_or_b32_e32 v18, v18, v61 -; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_or_b32_e32 v20, v20, v59 -; SI-NEXT: v_or_b32_e32 v21, v21, v58 -; SI-NEXT: v_or_b32_e32 v22, v22, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: .LBB30_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v45, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v44, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -14043,64 +13648,40 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v41, v12 -; SI-NEXT: v_or_b32_e32 v13, v33, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v16, v63, v16 -; SI-NEXT: v_or_b32_e32 v17, v62, v17 -; SI-NEXT: v_or_b32_e32 v18, v61, v18 -; SI-NEXT: v_or_b32_e32 v19, v60, v19 -; SI-NEXT: v_or_b32_e32 v20, v59, v20 -; SI-NEXT: v_or_b32_e32 v21, v58, v21 -; SI-NEXT: v_or_b32_e32 v22, v57, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -14111,28 +13692,51 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14783,319 +14387,270 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-LABEL: bitcast_v48i16_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v31, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v9, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 -; SI-NEXT: v_or_b32_e32 v23, v0, v25 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v39 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 -; SI-NEXT: v_mov_b32_e32 v52, v48 -; SI-NEXT: v_mov_b32_e32 v48, v35 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v37, v24 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v56, v40 -; SI-NEXT: v_mov_b32_e32 v40, v38 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 -; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v34 -; SI-NEXT: v_mov_b32_e32 v34, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 -; SI-NEXT: v_mov_b32_e32 v31, v38 -; SI-NEXT: v_mov_b32_e32 v38, v40 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v24, v37 -; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: @@ -15724,16 +15279,12 @@ end: define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v24f32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -15750,18 +15301,15 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -15772,119 +15320,126 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -15908,11 +15463,17 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -15932,37 +15493,32 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -15971,242 +15527,126 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16223,7 +15663,49 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48f16: @@ -16695,7 +16177,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-LABEL: bitcast_v24f32_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -16713,409 +16195,299 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v24, s18 +; SI-NEXT: v_mov_b32_e32 v23, s19 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v62, s20 -; SI-NEXT: v_mov_b32_e32 v59, s21 +; SI-NEXT: v_mov_b32_e32 v60, s21 ; SI-NEXT: v_mov_b32_e32 v58, s22 ; SI-NEXT: v_mov_b32_e32 v57, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v56, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s28 -; SI-NEXT: v_mov_b32_e32 v60, s29 +; SI-NEXT: v_mov_b32_e32 v63, s27 +; SI-NEXT: v_mov_b32_e32 v61, s28 +; SI-NEXT: v_mov_b32_e32 v59, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v60 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v58 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v57 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v59 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v60 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v20, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17132,66 +16504,97 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: @@ -17890,170 +17293,181 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -18089,9 +17503,14 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: v_or_b32_e32 v0, v54, v0 @@ -18101,9 +17520,14 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v38, v4 ; SI-NEXT: v_or_b32_e32 v5, v36, v5 ; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v21, v44, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: v_or_b32_e32 v10, v58, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -18119,86 +17543,71 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v20, v46, v20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 @@ -18231,158 +17640,128 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -18394,12 +17773,12 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -18409,7 +17788,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -18421,17 +17800,29 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -18439,22 +17830,22 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19106,165 +18497,281 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-LABEL: bitcast_v48f16_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v55, v7 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v54, v7 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 ; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 ; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 ; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v59, v15 -; SI-NEXT: v_or_b32_e32 v16, v57, v16 -; SI-NEXT: v_or_b32_e32 v17, v47, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v25, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 -; SI-NEXT: v_or_b32_e32 v22, v26, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v17, v29, v17 +; SI-NEXT: v_or_b32_e32 v18, v27, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -19275,9 +18782,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -19285,99 +18792,50 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -19387,22 +18845,30 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -19410,159 +18876,157 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v59 -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v55, v52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v53, v43 -; SI-NEXT: v_mov_b32_e32 v52, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v25 -; SI-NEXT: v_mov_b32_e32 v43, v27 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v27, v43 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v43, v53 -; SI-NEXT: v_mov_b32_e32 v52, v55 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v28, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v51, v35 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v24, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v30, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v29, v60 ; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v32, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v28, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v33, v49 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v35, v51 +; SI-NEXT: v_mov_b32_e32 v36, v52 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v39, v55 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v48f16_to_v24f32_scalar: @@ -20919,260 +20383,185 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v12i64_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48i16: @@ -21681,53 +21070,53 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s40, v12 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v13 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v15 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_writelane_b32 v24, s30, 0 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v12 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s88, s5, 16 @@ -21808,175 +21197,107 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s76, 16 ; SI-NEXT: s_and_b32 s29, s40, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s35, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s74, 16 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s34, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s72, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s95, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s93, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s91, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s42, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s90, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s40, s42, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s89, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr76 @@ -22647,141 +21968,234 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB42_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -22797,135 +22211,38 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v39 -; SI-NEXT: v_or_b32_e32 v2, v2, v46 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v5, v5, v37 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v43 -; SI-NEXT: v_or_b32_e32 v9, v9, v35 -; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v33 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_or_b32_e32 v16, v16, v63 -; SI-NEXT: v_or_b32_e32 v17, v17, v62 -; SI-NEXT: v_or_b32_e32 v18, v18, v61 -; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_or_b32_e32 v20, v20, v59 -; SI-NEXT: v_or_b32_e32 v21, v21, v58 -; SI-NEXT: v_or_b32_e32 v22, v22, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -22934,15 +22251,23 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v45, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v44, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -22951,64 +22276,40 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v41, v12 -; SI-NEXT: v_or_b32_e32 v13, v33, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v16, v63, v16 -; SI-NEXT: v_or_b32_e32 v17, v62, v17 -; SI-NEXT: v_or_b32_e32 v18, v61, v18 -; SI-NEXT: v_or_b32_e32 v19, v60, v19 -; SI-NEXT: v_or_b32_e32 v20, v59, v20 -; SI-NEXT: v_or_b32_e32 v21, v58, v21 -; SI-NEXT: v_or_b32_e32 v22, v57, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -23019,42 +22320,65 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v48i16_to_v12i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v48i16_to_v12i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v32, v23 ; VI-NEXT: v_mov_b32_e32 v33, v22 ; VI-NEXT: v_mov_b32_e32 v34, v21 @@ -23691,319 +23015,270 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v48i16_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v31, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v9, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 -; SI-NEXT: v_or_b32_e32 v23, v0, v25 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v39 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 -; SI-NEXT: v_mov_b32_e32 v52, v48 -; SI-NEXT: v_mov_b32_e32 v48, v35 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v37, v24 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v56, v40 -; SI-NEXT: v_mov_b32_e32 v40, v38 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 -; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v34 -; SI-NEXT: v_mov_b32_e32 v34, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 -; SI-NEXT: v_mov_b32_e32 v31, v38 -; SI-NEXT: v_mov_b32_e32 v38, v40 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v24, v37 -; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: @@ -24632,16 +23907,12 @@ end: define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v12i64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -24658,18 +23929,15 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -24680,119 +23948,126 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -24816,61 +24091,61 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -24879,242 +24154,126 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -25131,7 +24290,49 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48f16: @@ -25639,140 +24840,139 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s22, v12 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_readfirstlane_b32 s40, v13 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_readfirstlane_b32 s41, v15 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_readfirstlane_b32 s24, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s27, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s25, v18 -; SI-NEXT: v_readfirstlane_b32 s26, v19 -; SI-NEXT: v_readfirstlane_b32 s20, v12 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_readfirstlane_b32 s19, v15 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_readfirstlane_b32 s17, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s7, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v13 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v15 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_readfirstlane_b32 s28, v16 +; SI-NEXT: v_readfirstlane_b32 s22, v17 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_readfirstlane_b32 s20, v19 +; SI-NEXT: v_readfirstlane_b32 s21, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v13 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v9 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s22, 3 +; SI-NEXT: s_add_u32 s4, s24, 3 ; SI-NEXT: s_addc_u32 s5, s40, 0 -; SI-NEXT: s_lshr_b32 s22, s4, 16 -; SI-NEXT: s_lshr_b32 s28, s5, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s29, s41, 0 -; SI-NEXT: s_lshr_b32 s40, s23, 16 -; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s24, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 ; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s26, s26, 0 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s26, 16 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s28, s28, 0 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s28, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s46, s20, 16 @@ -25805,275 +25005,204 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s76, s6, 16 ; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v12, v35, v12 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v18, v29, v18 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: v_or_b32_e32 v22, v25, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: @@ -26718,170 +25847,181 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -26917,9 +26057,14 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: v_or_b32_e32 v0, v54, v0 @@ -26929,9 +26074,14 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v38, v4 ; SI-NEXT: v_or_b32_e32 v5, v36, v5 ; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v21, v44, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: v_or_b32_e32 v10, v58, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -26947,86 +26097,71 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v20, v46, v20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 @@ -27059,158 +26194,128 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -27222,12 +26327,12 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -27237,7 +26342,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -27249,17 +26354,29 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -27267,22 +26384,22 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27934,165 +27051,281 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v55, v7 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v54, v7 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 ; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 ; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 ; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v59, v15 -; SI-NEXT: v_or_b32_e32 v16, v57, v16 -; SI-NEXT: v_or_b32_e32 v17, v47, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v25, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 -; SI-NEXT: v_or_b32_e32 v22, v26, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 -; SI-NEXT: s_cbranch_execnz .LBB47_3 -; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v17, v29, v17 +; SI-NEXT: v_or_b32_e32 v18, v27, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -28103,9 +27336,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -28113,99 +27346,50 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -28215,22 +27399,30 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -28238,159 +27430,157 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v59 -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v51, v35 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v53, v43 -; SI-NEXT: v_mov_b32_e32 v52, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v25 -; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v30 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v27, v43 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v43, v53 -; SI-NEXT: v_mov_b32_e32 v52, v55 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v28, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v24, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v62 ; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v30, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v29, v60 ; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v32, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v28, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v33, v49 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v35, v51 +; SI-NEXT: v_mov_b32_e32 v36, v52 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v39, v55 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v48f16_to_v12i64_scalar: @@ -28985,248 +28175,173 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v12f64_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48i16: @@ -29674,268 +28789,217 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-LABEL: bitcast_v12f64_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_mov_b32_e32 v23, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v17, s22 -; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_mov_b32_e32 v15, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v12, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v12f64_to_v48i16_scalar: @@ -30610,129 +29674,222 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -30760,135 +29917,38 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v39 -; SI-NEXT: v_or_b32_e32 v2, v2, v46 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 -; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v5, v5, v37 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v8, v8, v43 -; SI-NEXT: v_or_b32_e32 v9, v9, v35 -; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v33 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_or_b32_e32 v16, v16, v63 -; SI-NEXT: v_or_b32_e32 v17, v17, v62 -; SI-NEXT: v_or_b32_e32 v18, v18, v61 -; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_or_b32_e32 v20, v20, v59 -; SI-NEXT: v_or_b32_e32 v21, v21, v58 -; SI-NEXT: v_or_b32_e32 v22, v22, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -30897,15 +29957,23 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v45, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v44, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -30914,64 +29982,40 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v9, v35, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v41, v12 -; SI-NEXT: v_or_b32_e32 v13, v33, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v16, v63, v16 -; SI-NEXT: v_or_b32_e32 v17, v62, v17 -; SI-NEXT: v_or_b32_e32 v18, v61, v18 -; SI-NEXT: v_or_b32_e32 v19, v60, v19 -; SI-NEXT: v_or_b32_e32 v20, v59, v20 -; SI-NEXT: v_or_b32_e32 v21, v58, v21 -; SI-NEXT: v_or_b32_e32 v22, v57, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 @@ -30982,28 +30026,51 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v23 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31654,319 +30721,270 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-LABEL: bitcast_v48i16_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v31, v22 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: v_mov_b32_e32 v35, v18 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v38, v12 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v9, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 -; SI-NEXT: v_or_b32_e32 v23, v0, v25 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: v_or_b32_e32 v0, v52, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 -; SI-NEXT: v_mov_b32_e32 v51, v39 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 -; SI-NEXT: v_mov_b32_e32 v52, v48 -; SI-NEXT: v_mov_b32_e32 v48, v35 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v37, v24 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v56, v40 -; SI-NEXT: v_mov_b32_e32 v40, v38 -; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 -; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v34 -; SI-NEXT: v_mov_b32_e32 v34, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 -; SI-NEXT: v_mov_b32_e32 v31, v38 -; SI-NEXT: v_mov_b32_e32 v38, v40 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v24, v37 -; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: @@ -32595,16 +31613,12 @@ end: define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v12f64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -32621,18 +31635,15 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -32643,173 +31654,181 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -32818,242 +31837,126 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33070,7 +31973,49 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48f16: @@ -33518,22 +32463,22 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-LABEL: bitcast_v12f64_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_mov_b32_e32 v23, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v13, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -33552,136 +32497,135 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 @@ -33690,242 +32634,127 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_mov_b32_e32 v35, v10 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v62 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v58 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33942,68 +32771,109 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_branch .LBB53_2 -; +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_branch .LBB53_2 +; ; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -34676,170 +33546,181 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -34875,9 +33756,14 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: v_or_b32_e32 v0, v54, v0 @@ -34887,9 +33773,14 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v38, v4 ; SI-NEXT: v_or_b32_e32 v5, v36, v5 ; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v21, v44, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: v_or_b32_e32 v10, v58, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -34905,86 +33796,71 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v20, v46, v20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 @@ -35017,158 +33893,128 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -35180,12 +34026,12 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -35195,7 +34041,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -35207,17 +34053,29 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -35225,22 +34083,22 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35892,165 +34750,281 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-LABEL: bitcast_v48f16_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v55, v7 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v54, v7 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 ; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 ; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 ; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v59, v15 -; SI-NEXT: v_or_b32_e32 v16, v57, v16 -; SI-NEXT: v_or_b32_e32 v17, v47, v17 -; SI-NEXT: v_or_b32_e32 v18, v45, v18 -; SI-NEXT: v_or_b32_e32 v19, v25, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 -; SI-NEXT: v_or_b32_e32 v22, v26, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_or_b32_e32 v16, v31, v16 +; SI-NEXT: v_or_b32_e32 v17, v29, v17 +; SI-NEXT: v_or_b32_e32 v18, v27, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -36061,9 +35035,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -36071,99 +35045,50 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -36173,22 +35098,30 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -36196,159 +35129,157 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v59 -; SI-NEXT: v_mov_b32_e32 v59, v46 -; SI-NEXT: v_mov_b32_e32 v46, v41 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v47 -; SI-NEXT: v_mov_b32_e32 v47, v42 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v51, v35 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v53, v43 -; SI-NEXT: v_mov_b32_e32 v52, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v25 -; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v30 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v27, v43 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v43, v53 -; SI-NEXT: v_mov_b32_e32 v52, v55 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v28, v44 -; SI-NEXT: v_mov_b32_e32 v44, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v24, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v62 ; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v30, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_mov_b32_e32 v29, v60 ; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: v_mov_b32_e32 v41, v46 -; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v32, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v28, v59 +; SI-NEXT: v_mov_b32_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v33, v49 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v35, v51 +; SI-NEXT: v_mov_b32_e32 v36, v52 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v39, v55 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v48f16_to_v12f64_scalar: @@ -36943,709 +35874,635 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v48f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: .LBB56_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v63 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v48f16: @@ -38108,548 +36965,411 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-LABEL: bitcast_v48i16_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_mov_b32_e32 v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_mov_b32_e32 v63, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v60, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_mov_b32_e32 v15, v12 +; SI-NEXT: v_mov_b32_e32 v14, v57 +; SI-NEXT: v_mov_b32_e32 v12, v47 +; SI-NEXT: v_mov_b32_e32 v10, v45 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v43 +; SI-NEXT: v_mov_b32_e32 v60, v41 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v35 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v48i16_to_v48f16_scalar: @@ -39410,493 +38130,426 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v59 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v52 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v52, v21 -; SI-NEXT: v_mov_b32_e32 v21, v6 -; SI-NEXT: v_mov_b32_e32 v6, v7 -; SI-NEXT: v_mov_b32_e32 v7, v8 -; SI-NEXT: v_mov_b32_e32 v8, v1 -; SI-NEXT: v_mov_b32_e32 v58, v2 -; SI-NEXT: v_mov_b32_e32 v60, v50 -; SI-NEXT: v_mov_b32_e32 v50, v3 -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v43 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v60, v28, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v52 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_or_b32_e32 v52, v32, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 -; SI-NEXT: v_or_b32_e32 v49, v31, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_or_b32_e32 v35, v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v57, v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v56, v24, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v34, v24, v47 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 -; SI-NEXT: v_or_b32_e32 v59, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_or_b32_e32 v5, v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v11, v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v36 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v25, v25, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 -; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_or_b32_e32 v19, v19, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_or_b32_e32 v17, v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 -; SI-NEXT: v_or_b32_e32 v38, v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v36 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v37, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_or_b32_e32 v58, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v28, v28, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v20, v20, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v31, v31, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 -; SI-NEXT: v_or_b32_e32 v51, v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v50, v24, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v30 -; SI-NEXT: v_or_b32_e32 v6, v6, v55 -; SI-NEXT: v_or_b32_e32 v21, v21, v45 -; SI-NEXT: v_alignbit_b32 v44, v50, v27, 16 -; SI-NEXT: v_alignbit_b32 v43, v51, v28, 16 -; SI-NEXT: v_alignbit_b32 v42, v37, v29, 16 -; SI-NEXT: v_mov_b32_e32 v29, v49 -; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 -; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v19, v45, 16 -; SI-NEXT: v_alignbit_b32 v54, v20, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v14, v25, 16 -; SI-NEXT: v_alignbit_b32 v25, v10, v46, 16 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_alignbit_b32 v24, v11, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, v59, v47, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v34, v34, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_or_b32_e32 v36, v36, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v55 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_or_b32_e32 v35, v35, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v32, v32, v51 +; SI-NEXT: v_or_b32_e32 v33, v33, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_or_b32_e32 v29, v29, v48 +; SI-NEXT: v_or_b32_e32 v30, v30, v43 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v37 +; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: v_alignbit_b32 v40, v2, v24, 16 +; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v41, 16 +; SI-NEXT: v_alignbit_b32 v53, v34, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v10, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v42, 16 +; SI-NEXT: v_alignbit_b32 v50, v31, v49, 16 +; SI-NEXT: v_alignbit_b32 v49, v16, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v43, 16 +; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v37, 16 +; SI-NEXT: v_alignbit_b32 v37, v25, v44, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v54 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v49 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v48i16: @@ -40360,461 +39013,458 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v6 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_or_b32_e32 v27, v14, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v59, v3, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v62, v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_or_b32_e32 v57, v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v35, v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v13 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v33, v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v32, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v29, v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v28, v26, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v27, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v43 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_or_b32_e32 v44, v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 -; SI-NEXT: v_or_b32_e32 v4, v4, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v6, v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_or_b32_e32 v59, v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_or_b32_e32 v31, v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v25 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v58, v14, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v32, v16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_or_b32_e32 v10, v10, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 -; SI-NEXT: v_or_b32_e32 v12, v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_or_b32_e32 v62, v16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: v_or_b32_e32 v18, v18, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_or_b32_e32 v63, v20, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v26, v22, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshr_b64 v[50:51], v[15:16], 16 -; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v28, v22, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v29, v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v24 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_or_b32_e32 v13, v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_mov_b32_e32 v51, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v31, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[54:55], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: v_mov_b32_e32 v49, v28 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_mov_b32_e32 v11, v33 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[3:4], 16 +; SI-NEXT: v_or_b32_e32 v11, v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[38:39], v[8:9], 16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: v_lshr_b64 v[36:37], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_mov_b32_e32 v31, v27 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[54:55], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[6:7], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v60 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 63b61f6b02854..18de1fc68024e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2570,297 +2570,217 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v26i32_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v52i16: @@ -3409,60 +3329,60 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s42, v14 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v15 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s9, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v16 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s92, s5, 16 @@ -3549,192 +3469,120 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s88, 16 ; SI-NEXT: s_and_b32 s29, s42, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s48, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s39, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_lshl_b32 s25, s38, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s74, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s95, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s94, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s42, s44, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s93, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr88 @@ -4452,192 +4300,242 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -4674,97 +4572,39 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v46 -; SI-NEXT: v_or_b32_e32 v9, v9, v45 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v41 -; SI-NEXT: v_or_b32_e32 v17, v17, v33 -; SI-NEXT: v_or_b32_e32 v18, v18, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v61 -; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -4772,14 +4612,22 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v47, v6 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -4787,78 +4635,50 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v45, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: v_or_b32_e32 v15, v34, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v33, v17 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_or_b32_e32 v20, v63, v20 -; SI-NEXT: v_or_b32_e32 v21, v62, v21 -; SI-NEXT: v_or_b32_e32 v22, v61, v22 -; SI-NEXT: v_or_b32_e32 v23, v60, v23 -; SI-NEXT: v_or_b32_e32 v24, v59, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -4869,29 +4689,58 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5600,366 +5449,296 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v52i16_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v20 -; SI-NEXT: v_mov_b32_e32 v48, v18 -; SI-NEXT: v_mov_b32_e32 v49, v16 -; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v24, v0, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 -; SI-NEXT: v_or_b32_e32 v25, v0, v27 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v25, v0, v52 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: v_mov_b32_e32 v44, v40 -; SI-NEXT: v_mov_b32_e32 v40, v39 -; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v45 -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v41, v48 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 -; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v48 -; SI-NEXT: v_mov_b32_e32 v48, v41 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v39 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: @@ -6637,16 +6416,16 @@ end: define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v26i32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6663,129 +6442,115 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -6793,35 +6558,47 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -6847,22 +6624,17 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -6873,6 +6645,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 @@ -6880,36 +6653,36 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -6924,271 +6697,187 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_mov_b32_e32 v55, v24 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7205,7 +6894,13 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v52f16: @@ -7753,48 +7448,48 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_readfirstlane_b32 s41, v15 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_readfirstlane_b32 s23, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v14 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v15 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s29, v16 -; SI-NEXT: v_readfirstlane_b32 s22, v17 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s20, v19 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s18, v15 -; SI-NEXT: v_readfirstlane_b32 s17, v1 -; SI-NEXT: v_readfirstlane_b32 s16, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v17 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_readfirstlane_b32 s29, v13 +; SI-NEXT: v_readfirstlane_b32 s23, v14 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s17, v0 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v11 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill @@ -7803,97 +7498,96 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -7913,13 +7607,13 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: s_lshr_b32 s43, s24, 16 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s26, 16 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_lshr_b32 s47, s28, 16 -; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 ; SI-NEXT: s_lshr_b32 s57, s22, 16 ; SI-NEXT: s_lshr_b32 s58, s21, 16 ; SI-NEXT: s_lshr_b32 s59, s20, 16 @@ -7937,303 +7631,228 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s79, s7, 16 ; SI-NEXT: s_lshr_b32 s88, s6, 16 ; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v43, v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v41, v41, v42 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v28, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_or_b32_e32 v4, v55, v4 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v12, v39, v12 +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_or_b32_e32 v16, v35, v16 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v20, v31, v20 +; SI-NEXT: v_or_b32_e32 v22, v29, v22 +; SI-NEXT: v_or_b32_e32 v24, v27, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: @@ -8923,187 +8542,203 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -9145,11 +8780,16 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 @@ -9157,11 +8797,16 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v52, v3 ; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v21, v56, v21 -; SI-NEXT: v_or_b32_e32 v22, v46, v22 -; SI-NEXT: v_or_b32_e32 v23, v44, v23 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: v_or_b32_e32 v5, v48, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_or_b32_e32 v24, v46, v24 +; SI-NEXT: v_or_b32_e32 v25, v44, v25 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -9173,100 +8818,85 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 -; SI-NEXT: v_or_b32_e32 v20, v58, v20 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 @@ -9287,7 +8917,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9297,145 +8927,117 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -9443,12 +9045,12 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -9458,7 +9060,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -9470,12 +9072,12 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 @@ -9485,7 +9087,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -9497,31 +9099,41 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -9530,22 +9142,22 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10255,420 +9867,490 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v63, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v17, v35, v17 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v19, v59, v19 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v5, v44, v5 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v54, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v39, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v25, v29, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -10678,7 +10360,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -10692,57 +10374,68 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: v_mov_b32_e32 v50, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v30 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v31 -; SI-NEXT: v_mov_b32_e32 v48, v61 -; SI-NEXT: v_mov_b32_e32 v61, v26 -; SI-NEXT: v_mov_b32_e32 v49, v62 -; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v48 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v62 -; SI-NEXT: v_mov_b32_e32 v62, v49 -; SI-NEXT: v_mov_b32_e32 v26, v61 -; SI-NEXT: v_mov_b32_e32 v61, v48 -; SI-NEXT: v_mov_b32_e32 v31, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v48, v54 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v52f16_to_v26i32_scalar: @@ -13026,297 +12719,217 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v26f32_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v52i16: @@ -13838,318 +13451,261 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-LABEL: bitcast_v26f32_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v13, s28 -; SI-NEXT: v_mov_b32_e32 v14, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v35 -; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v26f32_to_v52i16_scalar: @@ -14917,192 +14473,242 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -15139,97 +14745,39 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v46 -; SI-NEXT: v_or_b32_e32 v9, v9, v45 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v41 -; SI-NEXT: v_or_b32_e32 v17, v17, v33 -; SI-NEXT: v_or_b32_e32 v18, v18, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v61 -; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -15237,14 +14785,22 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v47, v6 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -15252,78 +14808,50 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v45, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: v_or_b32_e32 v15, v34, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v33, v17 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_or_b32_e32 v20, v63, v20 -; SI-NEXT: v_or_b32_e32 v21, v62, v21 -; SI-NEXT: v_or_b32_e32 v22, v61, v22 -; SI-NEXT: v_or_b32_e32 v23, v60, v23 -; SI-NEXT: v_or_b32_e32 v24, v59, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -15334,29 +14862,58 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16065,366 +15622,296 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-LABEL: bitcast_v52i16_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v20 -; SI-NEXT: v_mov_b32_e32 v48, v18 -; SI-NEXT: v_mov_b32_e32 v49, v16 -; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v24, v0, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 -; SI-NEXT: v_or_b32_e32 v25, v0, v27 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v25, v0, v52 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: v_mov_b32_e32 v44, v40 -; SI-NEXT: v_mov_b32_e32 v40, v39 -; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v45 -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v41, v48 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 -; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v48 -; SI-NEXT: v_mov_b32_e32 v48, v41 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v39 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: @@ -17102,16 +16589,16 @@ end: define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v26f32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -17128,129 +16615,115 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -17258,35 +16731,47 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -17312,23 +16797,18 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 @@ -17338,6 +16818,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -17345,36 +16826,36 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -17389,271 +16870,187 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_mov_b32_e32 v55, v24 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17670,7 +17067,13 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v52f16: @@ -18192,7 +17595,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-LABEL: bitcast_v26f32_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -18209,200 +17612,194 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v18, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v27, s17 +; SI-NEXT: v_mov_b32_e32 v26, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v24, s20 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v63, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v62, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v24, s24 -; SI-NEXT: v_mov_b32_e32 v23, s25 -; SI-NEXT: v_mov_b32_e32 v22, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v25, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v62 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -18413,268 +17810,115 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v29, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v11, v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18691,76 +17935,150 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v32 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v20, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v33 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: @@ -19528,187 +18846,203 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -19750,11 +19084,16 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 @@ -19762,11 +19101,16 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v52, v3 ; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v21, v56, v21 -; SI-NEXT: v_or_b32_e32 v22, v46, v22 -; SI-NEXT: v_or_b32_e32 v23, v44, v23 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: v_or_b32_e32 v5, v48, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_or_b32_e32 v24, v46, v24 +; SI-NEXT: v_or_b32_e32 v25, v44, v25 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -19778,100 +19122,85 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 -; SI-NEXT: v_or_b32_e32 v20, v58, v20 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 @@ -19892,7 +19221,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19902,145 +19231,117 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -20048,12 +19349,12 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -20063,7 +19364,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -20075,12 +19376,12 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 @@ -20090,7 +19391,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -20102,31 +19403,41 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -20135,22 +19446,22 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20860,420 +20171,490 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-LABEL: bitcast_v52f16_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v63, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v17, v35, v17 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v19, v59, v19 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v5, v44, v5 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v54, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v39, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v25, v29, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -21283,7 +20664,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -21297,57 +20678,68 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: v_mov_b32_e32 v50, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v30 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v31 -; SI-NEXT: v_mov_b32_e32 v48, v61 -; SI-NEXT: v_mov_b32_e32 v61, v26 -; SI-NEXT: v_mov_b32_e32 v49, v62 -; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v48 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v62 -; SI-NEXT: v_mov_b32_e32 v62, v49 -; SI-NEXT: v_mov_b32_e32 v26, v61 -; SI-NEXT: v_mov_b32_e32 v61, v48 -; SI-NEXT: v_mov_b32_e32 v31, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v48, v54 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v52f16_to_v26f32_scalar: @@ -22791,297 +22183,217 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v13i64_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52i16: @@ -23644,60 +22956,60 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s42, v14 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v15 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s24, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s9, v8 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v16 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s92, s5, 16 @@ -23784,192 +23096,120 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s88, 16 ; SI-NEXT: s_and_b32 s29, s42, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s48, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s39, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_lshl_b32 s25, s38, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s74, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s95, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s44, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s94, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s42, s44, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s93, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr88 @@ -24687,192 +23927,242 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -24909,97 +24199,39 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v46 -; SI-NEXT: v_or_b32_e32 v9, v9, v45 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v41 -; SI-NEXT: v_or_b32_e32 v17, v17, v33 -; SI-NEXT: v_or_b32_e32 v18, v18, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v61 -; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -25007,14 +24239,22 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v47, v6 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -25022,78 +24262,50 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v45, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: v_or_b32_e32 v15, v34, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v33, v17 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_or_b32_e32 v20, v63, v20 -; SI-NEXT: v_or_b32_e32 v21, v62, v21 -; SI-NEXT: v_or_b32_e32 v22, v61, v22 -; SI-NEXT: v_or_b32_e32 v23, v60, v23 -; SI-NEXT: v_or_b32_e32 v24, v59, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -25104,29 +24316,58 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25835,366 +25076,296 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v52i16_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v20 -; SI-NEXT: v_mov_b32_e32 v48, v18 -; SI-NEXT: v_mov_b32_e32 v49, v16 -; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v24, v0, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 -; SI-NEXT: v_or_b32_e32 v25, v0, v27 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v25, v0, v52 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: v_mov_b32_e32 v44, v40 -; SI-NEXT: v_mov_b32_e32 v40, v39 -; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v45 -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v41, v48 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 -; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v48 -; SI-NEXT: v_mov_b32_e32 v48, v41 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v39 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: @@ -26872,16 +26043,16 @@ end: define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v13i64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -26898,129 +26069,115 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -27028,35 +26185,47 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -27082,70 +26251,65 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -27160,271 +26324,187 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_mov_b32_e32 v55, v24 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27441,7 +26521,13 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52f16: @@ -28003,48 +27089,48 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_readfirstlane_b32 s41, v15 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s42, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s25, v16 -; SI-NEXT: v_readfirstlane_b32 s26, v17 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_readfirstlane_b32 s19, v15 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s7, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_readfirstlane_b32 s28, v13 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v19 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v11 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill @@ -28053,108 +27139,107 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s40, 3 ; SI-NEXT: s_addc_u32 s5, s41, 0 -; SI-NEXT: s_lshr_b32 s28, s4, 16 -; SI-NEXT: s_lshr_b32 s29, s5, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s40, s42, 0 -; SI-NEXT: s_lshr_b32 s41, s22, 16 -; SI-NEXT: s_lshr_b32 s42, s40, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s43, s43, 0 -; SI-NEXT: s_lshr_b32 s44, s23, 16 -; SI-NEXT: s_lshr_b32 s45, s43, 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: s_lshr_b32 s40, s5, 16 ; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_addc_u32 s41, s42, 0 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s41, 16 ; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s26, s26, 0 -; SI-NEXT: s_lshr_b32 s56, s25, 16 -; SI-NEXT: s_lshr_b32 s57, s26, 16 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s28, s28, 0 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: s_lshr_b32 s47, s28, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s56, s22, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s58, s20, 16 @@ -28187,303 +27272,228 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s88, s6, 16 ; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v43, v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v41, v41, v42 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v53, v53, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v33, v34, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v28, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_or_b32_e32 v4, v55, v4 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v12, v39, v12 +; SI-NEXT: v_or_b32_e32 v14, v37, v14 +; SI-NEXT: v_or_b32_e32 v16, v35, v16 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v20, v31, v20 +; SI-NEXT: v_or_b32_e32 v22, v29, v22 +; SI-NEXT: v_or_b32_e32 v24, v27, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: @@ -29173,187 +28183,203 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -29395,11 +28421,16 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 @@ -29407,11 +28438,16 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v52, v3 ; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v21, v56, v21 -; SI-NEXT: v_or_b32_e32 v22, v46, v22 -; SI-NEXT: v_or_b32_e32 v23, v44, v23 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: v_or_b32_e32 v5, v48, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_or_b32_e32 v24, v46, v24 +; SI-NEXT: v_or_b32_e32 v25, v44, v25 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -29423,100 +28459,85 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 -; SI-NEXT: v_or_b32_e32 v20, v58, v20 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 @@ -29537,7 +28558,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -29547,145 +28568,117 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -29693,12 +28686,12 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -29708,7 +28701,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -29720,12 +28713,12 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 @@ -29735,7 +28728,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -29747,31 +28740,41 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -29780,22 +28783,22 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -30505,420 +29508,490 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v63, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v17, v35, v17 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v19, v59, v19 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v5, v44, v5 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v54, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v39, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v25, v29, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -30928,7 +30001,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -30942,57 +30015,68 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: v_mov_b32_e32 v50, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v30 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v31 -; SI-NEXT: v_mov_b32_e32 v48, v61 -; SI-NEXT: v_mov_b32_e32 v61, v26 -; SI-NEXT: v_mov_b32_e32 v49, v62 -; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v48 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v62 -; SI-NEXT: v_mov_b32_e32 v62, v49 -; SI-NEXT: v_mov_b32_e32 v26, v61 -; SI-NEXT: v_mov_b32_e32 v61, v48 -; SI-NEXT: v_mov_b32_e32 v31, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v48, v54 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v52f16_to_v13i64_scalar: @@ -31634,284 +30718,204 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v13f64_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52i16: @@ -32407,305 +31411,248 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-LABEL: bitcast_v13f64_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v13, s28 -; SI-NEXT: v_mov_b32_e32 v14, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v35 -; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v13f64_to_v52i16_scalar: @@ -33447,192 +32394,242 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -33669,97 +32666,39 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v46 -; SI-NEXT: v_or_b32_e32 v9, v9, v45 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v41 -; SI-NEXT: v_or_b32_e32 v17, v17, v33 -; SI-NEXT: v_or_b32_e32 v18, v18, v40 -; SI-NEXT: v_or_b32_e32 v19, v19, v32 -; SI-NEXT: v_or_b32_e32 v20, v20, v63 -; SI-NEXT: v_or_b32_e32 v21, v21, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v61 -; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -33767,14 +32706,22 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v47, v6 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -33782,78 +32729,50 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v45, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: v_or_b32_e32 v15, v34, v15 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v33, v17 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_or_b32_e32 v20, v63, v20 -; SI-NEXT: v_or_b32_e32 v21, v62, v21 -; SI-NEXT: v_or_b32_e32 v22, v61, v22 -; SI-NEXT: v_or_b32_e32 v23, v60, v23 -; SI-NEXT: v_or_b32_e32 v24, v59, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -33864,29 +32783,58 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34595,366 +33543,296 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-LABEL: bitcast_v52i16_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v20 -; SI-NEXT: v_mov_b32_e32 v48, v18 -; SI-NEXT: v_mov_b32_e32 v49, v16 -; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v24, v0, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 -; SI-NEXT: v_or_b32_e32 v25, v0, v27 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v25, v0, v52 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v61, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: v_mov_b32_e32 v44, v40 -; SI-NEXT: v_mov_b32_e32 v40, v39 -; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v45 -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v41, v48 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 -; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v48 -; SI-NEXT: v_mov_b32_e32 v48, v41 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v39 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: @@ -35632,16 +34510,16 @@ end: define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v13f64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -35658,129 +34536,115 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -35788,97 +34652,106 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 @@ -35893,271 +34766,187 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_mov_b32_e32 v55, v24 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -36174,7 +34963,13 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52f16: @@ -36670,22 +35465,22 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-LABEL: bitcast_v13f64_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v17, s20 -; SI-NEXT: v_mov_b32_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v23, s22 -; SI-NEXT: v_mov_b32_e32 v24, s23 -; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v13, s28 -; SI-NEXT: v_mov_b32_e32 v14, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -36704,173 +35499,164 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -36881,267 +35667,185 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_mov_b32_e32 v43, v10 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v51, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -37158,76 +35862,81 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: @@ -37969,187 +36678,203 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -38191,11 +36916,16 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 @@ -38203,11 +36933,16 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v52, v3 ; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v21, v56, v21 -; SI-NEXT: v_or_b32_e32 v22, v46, v22 -; SI-NEXT: v_or_b32_e32 v23, v44, v23 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: v_or_b32_e32 v5, v48, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v9, v32, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_or_b32_e32 v24, v46, v24 +; SI-NEXT: v_or_b32_e32 v25, v44, v25 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -38219,100 +36954,85 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 -; SI-NEXT: v_or_b32_e32 v20, v58, v20 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 @@ -38333,7 +37053,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -38343,145 +37063,117 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -38489,12 +37181,12 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -38504,7 +37196,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -38516,12 +37208,12 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 @@ -38531,7 +37223,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -38543,31 +37235,41 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -38576,22 +37278,22 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39301,420 +38003,490 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-LABEL: bitcast_v52f16_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v63, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v17, v35, v17 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v19, v59, v19 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_or_b32_e32 v5, v44, v5 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v54, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v39, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: v_or_b32_e32 v17, v30, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: v_or_b32_e32 v25, v29, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -39724,7 +38496,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -39738,57 +38510,68 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v50, v63 -; SI-NEXT: v_mov_b32_e32 v63, v58 -; SI-NEXT: v_mov_b32_e32 v58, v30 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v31 -; SI-NEXT: v_mov_b32_e32 v48, v61 -; SI-NEXT: v_mov_b32_e32 v61, v26 -; SI-NEXT: v_mov_b32_e32 v49, v62 -; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v48 +; SI-NEXT: v_mov_b32_e32 v52, v26 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v27, v62 -; SI-NEXT: v_mov_b32_e32 v62, v49 -; SI-NEXT: v_mov_b32_e32 v26, v61 -; SI-NEXT: v_mov_b32_e32 v61, v48 -; SI-NEXT: v_mov_b32_e32 v31, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v26, v52 +; SI-NEXT: v_mov_b32_e32 v48, v54 +; SI-NEXT: v_mov_b32_e32 v49, v55 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v45 +; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v52f16_to_v13f64_scalar: @@ -40430,803 +39213,714 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v52f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v52f16: @@ -41743,744 +40437,496 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-LABEL: bitcast_v52i16_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_mov_b32_e32 v22, v16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v23, v17 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v24, v18 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v25, v19 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v21, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 +; SI-NEXT: v_mov_b32_e32 v14, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v17, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_mov_b32_e32 v18, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v20, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 -; SI-NEXT: s_branch .LBB57_3 -; SI-NEXT: .LBB57_2: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: .LBB57_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: v_mov_b32_e32 v62, v32 -; SI-NEXT: v_mov_b32_e32 v32, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v53 -; SI-NEXT: v_mov_b32_e32 v53, v55 -; SI-NEXT: v_mov_b32_e32 v55, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: s_cbranch_vccnz .LBB57_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: .LBB57_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v56 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: v_mov_b32_e32 v25, v19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: v_mov_b32_e32 v24, v18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: v_mov_b32_e32 v23, v17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mov_b32_e32 v22, v16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: v_mov_b32_e32 v21, v14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_mov_b32_e32 v20, v63 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: v_mov_b32_e32 v18, v47 +; SI-NEXT: v_mov_b32_e32 v17, v46 +; SI-NEXT: v_mov_b32_e32 v16, v45 +; SI-NEXT: v_mov_b32_e32 v14, v43 +; SI-NEXT: v_mov_b32_e32 v12, v41 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v52i16_to_v52f16_scalar: ; VI: ; %bb.0: @@ -43303,542 +41749,466 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v36 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v53 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v37, v22 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v39, v3 -; SI-NEXT: v_mov_b32_e32 v49, v5 -; SI-NEXT: v_mov_b32_e32 v54, v7 -; SI-NEXT: v_mov_b32_e32 v61, v8 -; SI-NEXT: v_mov_b32_e32 v63, v4 -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v63, v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v52 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v28, v28, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v24, v24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v2, v33, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v29, v29, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v31, v31, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v32, v32, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v34, v34, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 -; SI-NEXT: v_or_b32_e32 v61, v35, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v54, v33, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v49, v34, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 -; SI-NEXT: v_or_b32_e32 v39, v35, v55 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_or_b32_e32 v22, v22, v29 -; SI-NEXT: v_or_b32_e32 v37, v33, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: v_or_b32_e32 v35, v34, v27 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v59, v28, v26 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: v_or_b32_e32 v58, v28, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v28 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_or_b32_e32 v31, v25, v57 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v53, v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_or_b32_e32 v48, v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v62 -; SI-NEXT: v_or_b32_e32 v9, v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v60 -; SI-NEXT: v_or_b32_e32 v14, v14, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_or_b32_e32 v12, v12, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v18, v18, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v4, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 -; SI-NEXT: v_or_b32_e32 v36, v33, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_or_b32_e32 v52, v25, v33 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v50 -; SI-NEXT: v_or_b32_e32 v51, v28, v25 -; SI-NEXT: v_alignbit_b32 v45, v51, v30, 16 -; SI-NEXT: v_alignbit_b32 v44, v52, v44, 16 -; SI-NEXT: v_alignbit_b32 v43, v6, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v11, v41, 16 -; SI-NEXT: v_alignbit_b32 v41, v15, v46, 16 -; SI-NEXT: v_alignbit_b32 v40, v21, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v23, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, v18, v47, 16 -; SI-NEXT: v_alignbit_b32 v28, v12, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v14, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v9, v56, 16 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_alignbit_b32 v25, v48, v24, 16 -; SI-NEXT: v_alignbit_b32 v24, v53, v57, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v37, v37, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_or_b32_e32 v39, v39, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_or_b32_e32 v38, v38, v43 +; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v40 +; SI-NEXT: v_or_b32_e32 v36, v36, v45 +; SI-NEXT: v_or_b32_e32 v10, v10, v54 +; SI-NEXT: v_or_b32_e32 v14, v14, v53 +; SI-NEXT: v_or_b32_e32 v33, v33, v46 +; SI-NEXT: v_or_b32_e32 v16, v16, v51 +; SI-NEXT: v_or_b32_e32 v20, v20, v50 +; SI-NEXT: v_or_b32_e32 v30, v30, v47 +; SI-NEXT: v_or_b32_e32 v22, v22, v48 +; SI-NEXT: v_or_b32_e32 v27, v27, v56 +; SI-NEXT: v_alignbit_b32 v44, v2, v26, 16 +; SI-NEXT: v_alignbit_b32 v43, v39, v43, 16 +; SI-NEXT: v_alignbit_b32 v42, v6, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v37, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v35, v45, 16 +; SI-NEXT: v_alignbit_b32 v55, v12, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v34, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v32, v46, 16 +; SI-NEXT: v_alignbit_b32 v52, v18, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v31, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, v29, v47, 16 +; SI-NEXT: v_alignbit_b32 v49, v24, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v28, v56, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v30, v30, v33 -; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 -; SI-NEXT: v_or_b32_e32 v30, v30, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v44 -; SI-NEXT: v_or_b32_e32 v30, v30, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v26 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v26 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; SI-NEXT: v_or_b32_e32 v20, v20, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v26 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v48 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v52i16: @@ -44356,527 +42726,514 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_mov_b32_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 -; SI-NEXT: v_mov_b32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v39, v7, v19 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v38, v7, v15 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v23 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v0 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v38, v22, v2 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 +; SI-NEXT: v_or_b32_e32 v32, v22, v4 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v49, v20, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v20, v8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v17 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v11 -; SI-NEXT: v_or_b32_e32 v56, v7, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_or_b32_e32 v36, v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_or_b32_e32 v37, v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v35, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_or_b32_e32 v33, v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_or_b32_e32 v31, v29, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_or_b32_e32 v4, v4, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v22, v22, v10 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v12 +; SI-NEXT: v_or_b32_e32 v20, v20, v14 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v48, v24, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: v_or_b32_e32 v50, v20, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v28, v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v27, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v29 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v47 -; SI-NEXT: v_or_b32_e32 v12, v12, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v27 +; SI-NEXT: v_or_b32_e32 v37, v26, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 +; SI-NEXT: v_lshr_b64 v[46:47], v[22:23], 16 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_lshr_b64 v[50:51], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v61 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v28 -; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 +; SI-NEXT: v_mov_b32_e32 v47, v28 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v44 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_mov_b32_e32 v53, v33 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v46 -; SI-NEXT: v_or_b32_e32 v24, v24, v27 -; SI-NEXT: v_lshr_b64 v[43:44], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v7, v56 -; SI-NEXT: v_lshr_b64 v[55:56], v[3:4], 16 -; SI-NEXT: v_mov_b32_e32 v44, v37 -; SI-NEXT: v_lshr_b64 v[41:42], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_lshr_b64 v[39:40], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_lshr_b64 v[37:38], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v40, v35 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: v_lshr_b64 v[48:49], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_mov_b32_e32 v34, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_lshr_b64 v[52:53], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v27 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v55, v48 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_mov_b32_e32 v51, v37 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_lshr_b64 v[34:35], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v35, v31 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[26:27], v[24:25], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v44 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 41b86c0960b46..b42188f0f3980 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -2719,327 +2719,245 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v28i32_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v56i16: @@ -3638,67 +3556,67 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s44, v16 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_readfirstlane_b32 s45, v17 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_readfirstlane_b32 s24, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s9, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_readfirstlane_b32 s45, v16 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v16 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s30, s5, 16 @@ -3791,212 +3709,133 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s92, 16 ; SI-NEXT: s_and_b32 s29, s44, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s52, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s51, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s44, s46, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr92 @@ -4776,167 +4615,260 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB14_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -4980,195 +4912,41 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v16, v16, v49 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v47 -; SI-NEXT: v_or_b32_e32 v17, v17, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v21, v21, v33 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v41 -; SI-NEXT: v_or_b32_e32 v24, v24, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v62 -; SI-NEXT: v_or_b32_e32 v26, v26, v61 -; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v16, v49, v16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -5178,45 +4956,22 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v47, v15 -; SI-NEXT: v_or_b32_e32 v17, v48, v17 -; SI-NEXT: v_or_b32_e32 v18, v38, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v21, v33, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v63, v24 -; SI-NEXT: v_or_b32_e32 v25, v62, v25 -; SI-NEXT: v_or_b32_e32 v26, v61, v26 -; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v60, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 @@ -5224,41 +4979,121 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6025,402 +5860,323 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v56i16_to_v28i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v7 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v29 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 -; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 -; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: @@ -7147,49 +6903,17 @@ end: define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v28i32_to_v56f16: ; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7206,116 +6930,139 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -7326,36 +7073,45 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -7383,30 +7139,28 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 @@ -7422,43 +7176,40 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -7476,299 +7227,210 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: v_mov_b32_e32 v43, v1 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v24 +; SI-NEXT: v_mov_b32_e32 v45, v25 +; SI-NEXT: v_mov_b32_e32 v43, v26 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7785,7 +7447,12 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v56f16: @@ -8383,50 +8050,50 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_readfirstlane_b32 s41, v17 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s23, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s29, v18 -; SI-NEXT: v_readfirstlane_b32 s22, v19 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v1 -; SI-NEXT: v_readfirstlane_b32 s18, v2 -; SI-NEXT: v_readfirstlane_b32 s17, v3 -; SI-NEXT: v_readfirstlane_b32 s16, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_readfirstlane_b32 s41, v16 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v16 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v17 +; SI-NEXT: v_readfirstlane_b32 s20, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v0 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 +; SI-NEXT: v_readfirstlane_b32 s16, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v13 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill @@ -8439,106 +8106,105 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -8560,13 +8226,13 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s5, s41, 16 ; SI-NEXT: s_lshr_b32 s44, s42, 16 ; SI-NEXT: s_lshr_b32 s45, s43, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 16 -; SI-NEXT: s_lshr_b32 s47, s24, 16 -; SI-NEXT: s_lshr_b32 s56, s25, 16 -; SI-NEXT: s_lshr_b32 s57, s26, 16 -; SI-NEXT: s_lshr_b32 s58, s27, 16 -; SI-NEXT: s_lshr_b32 s59, s28, 16 -; SI-NEXT: s_lshr_b32 s60, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s28, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 ; SI-NEXT: s_lshr_b32 s61, s22, 16 ; SI-NEXT: s_lshr_b32 s62, s21, 16 ; SI-NEXT: s_lshr_b32 s63, s20, 16 @@ -8584,262 +8250,105 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s91, s7, 16 ; SI-NEXT: s_lshr_b32 s92, s6, 16 ; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s42 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_or_b32_e32 v47, v47, v56 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v45, v45, v46 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v43, v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v41, v41, v42 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v55, v40, v55 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v53, v54, v53 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v32, v34, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -8849,65 +8358,141 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_or_b32_e32 v25, v30, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: v_or_b32_e32 v10, v53, v10 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v16, v39, v16 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v22, v33, v22 +; SI-NEXT: v_or_b32_e32 v24, v31, v24 +; SI-NEXT: v_or_b32_e32 v26, v29, v26 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v28i32_to_v56f16_scalar: @@ -9657,198 +9242,219 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -9898,6 +9504,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 @@ -9910,6 +9520,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 ; SI-NEXT: v_or_b32_e32 v21, v60, v21 ; SI-NEXT: v_or_b32_e32 v22, v58, v22 ; SI-NEXT: v_or_b32_e32 v23, v48, v23 @@ -9924,6 +9538,14 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 @@ -9941,84 +9563,65 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -10030,7 +9633,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 @@ -10050,187 +9653,177 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -10238,9 +9831,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -10250,7 +9843,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -10262,7 +9855,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -10315,22 +9908,22 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11098,456 +10691,546 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v28i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v59, v7 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v57, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 @@ -11559,22 +11242,16 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -11586,12 +11263,12 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -11601,7 +11278,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 @@ -11615,38 +11292,63 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v28, v41 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: @@ -14072,327 +13774,245 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v28f32_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v56i16: @@ -14962,346 +14582,288 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-LABEL: bitcast_v28f32_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v27, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v15, s28 -; SI-NEXT: v_mov_b32_e32 v16, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_mov_b32_e32 v55, v28 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v28f32_to_v56i16_scalar: @@ -16137,167 +15699,260 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -16341,195 +15996,41 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v16, v16, v49 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v47 -; SI-NEXT: v_or_b32_e32 v17, v17, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v21, v21, v33 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v41 -; SI-NEXT: v_or_b32_e32 v24, v24, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v62 -; SI-NEXT: v_or_b32_e32 v26, v26, v61 -; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v16, v49, v16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -16539,45 +16040,22 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v47, v15 -; SI-NEXT: v_or_b32_e32 v17, v48, v17 -; SI-NEXT: v_or_b32_e32 v18, v38, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v21, v33, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v63, v24 -; SI-NEXT: v_or_b32_e32 v25, v62, v25 -; SI-NEXT: v_or_b32_e32 v26, v61, v26 -; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v60, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 @@ -16585,41 +16063,121 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17386,402 +16944,323 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-LABEL: bitcast_v56i16_to_v28f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v7 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v29 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 -; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 -; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: @@ -18508,49 +17987,17 @@ end: define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v28f32_to_v56f16: ; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -18567,116 +18014,139 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -18687,36 +18157,45 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -18744,30 +18223,28 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 @@ -18783,43 +18260,40 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -18837,299 +18311,210 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: v_mov_b32_e32 v43, v1 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v24 +; SI-NEXT: v_mov_b32_e32 v45, v25 +; SI-NEXT: v_mov_b32_e32 v43, v26 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19146,7 +18531,12 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v56f16: @@ -19716,22 +19106,22 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-LABEL: bitcast_v28f32_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v23, s17 -; SI-NEXT: v_mov_b32_e32 v22, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 -; SI-NEXT: v_mov_b32_e32 v34, s20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v35, s21 ; SI-NEXT: v_mov_b32_e32 v33, s22 ; SI-NEXT: v_mov_b32_e32 v32, s23 -; SI-NEXT: v_mov_b32_e32 v31, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v28, s26 -; SI-NEXT: v_mov_b32_e32 v27, s27 -; SI-NEXT: v_mov_b32_e32 v26, s28 -; SI-NEXT: v_mov_b32_e32 v24, s29 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_mov_b32_e32 v31, s28 +; SI-NEXT: v_mov_b32_e32 v23, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19750,219 +19140,210 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v29 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v21 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -19975,293 +19356,122 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v39, v13 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20278,86 +19488,167 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v36 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: @@ -21193,198 +20484,219 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -21434,6 +20746,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 @@ -21446,6 +20762,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 ; SI-NEXT: v_or_b32_e32 v21, v60, v21 ; SI-NEXT: v_or_b32_e32 v22, v58, v22 ; SI-NEXT: v_or_b32_e32 v23, v48, v23 @@ -21460,6 +20780,14 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 @@ -21477,84 +20805,65 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -21566,7 +20875,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 @@ -21586,187 +20895,177 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -21774,9 +21073,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -21786,7 +21085,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -21798,7 +21097,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -21851,22 +21150,22 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -22634,456 +21933,546 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-LABEL: bitcast_v56f16_to_v28f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v59, v7 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v57, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 @@ -23095,22 +22484,16 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -23122,12 +22505,12 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -23137,7 +22520,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 @@ -23151,38 +22534,63 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v28, v41 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: @@ -24720,327 +24128,245 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v14i64_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i64_to_v56i16: @@ -25653,67 +24979,67 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s44, v16 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_readfirstlane_b32 s45, v17 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_writelane_b32 v20, s51, 11 -; SI-NEXT: v_readfirstlane_b32 s24, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s9, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_readfirstlane_b32 s45, v16 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_readfirstlane_b32 s24, v16 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s30, s5, 16 @@ -25806,212 +25132,133 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s92, 16 ; SI-NEXT: s_and_b32 s29, s44, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s52, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s51, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s46, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s44, s46, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s31, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr92 @@ -26791,167 +26038,260 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB42_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -26995,195 +26335,41 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v16, v16, v49 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v47 -; SI-NEXT: v_or_b32_e32 v17, v17, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v21, v21, v33 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v41 -; SI-NEXT: v_or_b32_e32 v24, v24, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v62 -; SI-NEXT: v_or_b32_e32 v26, v26, v61 -; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v16, v49, v16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -27193,45 +26379,22 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v47, v15 -; SI-NEXT: v_or_b32_e32 v17, v48, v17 -; SI-NEXT: v_or_b32_e32 v18, v38, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v21, v33, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v63, v24 -; SI-NEXT: v_or_b32_e32 v25, v62, v25 -; SI-NEXT: v_or_b32_e32 v26, v61, v26 -; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v60, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 @@ -27239,41 +26402,121 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28040,402 +27283,323 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v56i16_to_v14i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v7 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v29 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 -; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 -; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: @@ -29162,49 +28326,17 @@ end: define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v14i64_to_v56f16: ; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -29221,116 +28353,139 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -29341,36 +28496,45 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -29398,82 +28562,78 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -29491,299 +28651,210 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: v_mov_b32_e32 v43, v1 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v24 +; SI-NEXT: v_mov_b32_e32 v45, v25 +; SI-NEXT: v_mov_b32_e32 v43, v26 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -29800,7 +28871,12 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i64_to_v56f16: @@ -30412,50 +29488,50 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v17, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s40, v16 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v17 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_readfirstlane_b32 s41, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_readfirstlane_b32 s44, v17 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s45, v19 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s24, v16 -; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_mov_b32_e32 v16, s27 ; SI-NEXT: v_readfirstlane_b32 s27, v17 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s25, v18 -; SI-NEXT: v_readfirstlane_b32 s26, v19 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_readfirstlane_b32 s19, v2 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_readfirstlane_b32 s17, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s7, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v13 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill @@ -30468,119 +29544,118 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s40, 3 ; SI-NEXT: s_addc_u32 s5, s42, 0 -; SI-NEXT: s_lshr_b32 s28, s4, 16 -; SI-NEXT: s_lshr_b32 s29, s5, 16 -; SI-NEXT: s_add_u32 s40, s41, 3 -; SI-NEXT: s_addc_u32 s41, s43, 0 -; SI-NEXT: s_lshr_b32 s42, s40, 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s42, s43, 0 ; SI-NEXT: s_lshr_b32 s43, s41, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_lshr_b32 s45, s42, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 ; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s24, 16 ; SI-NEXT: s_lshr_b32 s47, s44, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s27, 16 ; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s26, s26, 0 -; SI-NEXT: s_lshr_b32 s60, s25, 16 -; SI-NEXT: s_lshr_b32 s61, s26, 16 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s28, s28, 0 +; SI-NEXT: s_lshr_b32 s58, s26, 16 +; SI-NEXT: s_lshr_b32 s59, s28, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s60, s22, 16 +; SI-NEXT: s_lshr_b32 s61, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s62, s20, 16 @@ -30613,262 +29688,105 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 s92, s6, 16 ; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s29 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_or_b32_e32 v47, v47, v56 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v45, v45, v46 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v43, v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v41, v41, v42 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v55, v40, v55 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v53, v54, v53 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v37, v38, v37 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v35, v36, v35 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v32, v34, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30878,65 +29796,141 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_or_b32_e32 v25, v30, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: v_or_b32_e32 v10, v53, v10 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v16, v39, v16 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v22, v33, v22 +; SI-NEXT: v_or_b32_e32 v24, v31, v24 +; SI-NEXT: v_or_b32_e32 v26, v29, v26 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v14i64_to_v56f16_scalar: @@ -31686,198 +30680,219 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -31927,6 +30942,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 @@ -31939,6 +30958,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 ; SI-NEXT: v_or_b32_e32 v21, v60, v21 ; SI-NEXT: v_or_b32_e32 v22, v58, v22 ; SI-NEXT: v_or_b32_e32 v23, v48, v23 @@ -31953,6 +30976,14 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 @@ -31970,84 +31001,65 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -32059,7 +31071,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 @@ -32079,187 +31091,177 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -32267,9 +31269,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -32279,7 +31281,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -32291,7 +31293,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -32344,22 +31346,22 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33127,456 +32129,546 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v14i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v59, v7 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v57, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 @@ -33588,22 +32680,16 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -33615,12 +32701,12 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -33630,7 +32716,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 @@ -33644,38 +32730,63 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v28, v41 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: @@ -34364,313 +33475,231 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v14f64_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f64_to_v56i16: @@ -35212,332 +34241,271 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-LABEL: bitcast_v14f64_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v27, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 -; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v15, s28 -; SI-NEXT: v_mov_b32_e32 v16, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v14f64_to_v56i16_scalar: @@ -36343,167 +35311,260 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB50_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -36547,195 +35608,41 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v16, v16, v49 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v3, v51 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v43 -; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v47 -; SI-NEXT: v_or_b32_e32 v17, v17, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v19, v19, v44 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v21, v21, v33 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v41 -; SI-NEXT: v_or_b32_e32 v24, v24, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v62 -; SI-NEXT: v_or_b32_e32 v26, v26, v61 -; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v16, v49, v16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -36745,45 +35652,22 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: v_or_b32_e32 v12, v43, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v47, v15 -; SI-NEXT: v_or_b32_e32 v17, v48, v17 -; SI-NEXT: v_or_b32_e32 v18, v38, v18 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v21, v33, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v63, v24 -; SI-NEXT: v_or_b32_e32 v25, v62, v25 -; SI-NEXT: v_or_b32_e32 v26, v61, v26 -; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v60, v8 +; SI-NEXT: v_or_b32_e32 v9, v34, v9 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 @@ -36791,41 +35675,121 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37592,402 +36556,323 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-LABEL: bitcast_v56i16_to_v14f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v7 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v29 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 -; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 -; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: @@ -38715,48 +37600,46 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v14f64_to_v56f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -38773,11 +37656,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -38786,103 +37667,102 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -38890,19 +37770,20 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v29, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v28, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill @@ -38916,83 +37797,80 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_add_f64 v[53:54], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -39012,47 +37890,48 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v47, v25 -; SI-NEXT: v_mov_b32_e32 v45, v26 -; SI-NEXT: v_mov_b32_e32 v43, v27 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v40, v27 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill @@ -39066,247 +37945,159 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v40 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39323,7 +38114,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f64_to_v56f16: @@ -39865,22 +38660,22 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-LABEL: bitcast_v14f64_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v23, s22 -; SI-NEXT: v_mov_b32_e32 v24, s23 -; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v15, s28 -; SI-NEXT: v_mov_b32_e32 v16, s29 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -39899,210 +38694,205 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[53:54], v[20:21], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_add_f64 v[53:54], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_mov_b32_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: v_mov_b32_e32 v40, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -40114,291 +38904,201 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 -; SI-NEXT: v_mov_b32_e32 v29, v14 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v12 -; SI-NEXT: v_mov_b32_e32 v43, v13 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v44, v11 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v63 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -40415,49 +39115,53 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 @@ -41300,198 +40004,219 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -41541,6 +40266,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 @@ -41553,6 +40282,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 ; SI-NEXT: v_or_b32_e32 v21, v60, v21 ; SI-NEXT: v_or_b32_e32 v22, v58, v22 ; SI-NEXT: v_or_b32_e32 v23, v48, v23 @@ -41567,6 +40300,14 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 @@ -41584,84 +40325,65 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -41673,7 +40395,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 @@ -41693,187 +40415,177 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -41881,9 +40593,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -41893,7 +40605,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 @@ -41905,7 +40617,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -41958,22 +40670,22 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -42741,456 +41453,546 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-LABEL: bitcast_v56f16_to_v14f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: s_lshr_b32 s13, s20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v59, v7 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v28, v16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v57, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 @@ -43202,22 +42004,16 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -43229,12 +42025,12 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -43244,7 +42040,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 @@ -43258,38 +42054,63 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v28, v41 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v32, v48 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_mov_b32_e32 v63, v62 +; SI-NEXT: v_mov_b32_e32 v61, v60 +; SI-NEXT: v_mov_b32_e32 v59, v47 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: @@ -43978,897 +42799,795 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v56f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v57 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v58 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v59 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v60 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v61 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v51 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v56f16: @@ -45433,842 +44152,578 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-LABEL: bitcast_v56i16_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 -; SI-NEXT: v_mov_b32_e32 v47, v34 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_mov_b32_e32 v28, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v29, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v30, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v31, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v33, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v19, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_branch .LBB57_3 -; SI-NEXT: .LBB57_2: -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: v_mov_b32_e32 v47, v34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: .LBB57_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: v_mov_b32_e32 v36, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v52, v54 -; SI-NEXT: v_mov_b32_e32 v54, v40 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: s_cbranch_vccnz .LBB57_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: v_mov_b32_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v23, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 +; SI-NEXT: v_mov_b32_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: v_mov_b32_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: .LBB57_5: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v46 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v61 +; SI-NEXT: v_or_b32_e32 v19, v23, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v20, v23, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v63 +; SI-NEXT: v_or_b32_e32 v21, v29, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v28, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 +; SI-NEXT: v_or_b32_e32 v23, v30, v23 +; SI-NEXT: v_or_b32_e32 v24, v28, v24 +; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mov_b32_e32 v33, v41 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v31, v20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v30, v19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v29, v18 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v28, v17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v19, v16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v18, v27 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v17, v26 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v23, v25 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_mov_b32_e32 v16, v24 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v56i16_to_v56f16_scalar: ; VI: ; %bb.0: @@ -47156,592 +45611,509 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v52, v7 -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: v_mov_b32_e32 v55, v11 -; SI-NEXT: v_mov_b32_e32 v41, v13 -; SI-NEXT: v_mov_b32_e32 v48, v5 -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v29, v29, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v30, v30, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v24, v24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v31, v31, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v33, v33, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v5, v38, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v48, v39, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v34, v34, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_or_b32_e32 v9, v37, v45 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v38, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v36, v36, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_or_b32_e32 v12, v12, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_or_b32_e32 v37, v37, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v41, v39, v43 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v39, v39, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_or_b32_e32 v50, v50, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_or_b32_e32 v55, v37, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v54, v25, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: v_or_b32_e32 v52, v37, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 -; SI-NEXT: v_or_b32_e32 v51, v21, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v37 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_or_b32_e32 v62, v25, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v61, v29, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v60 -; SI-NEXT: v_or_b32_e32 v49, v21, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_or_b32_e32 v37, v25, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 -; SI-NEXT: v_or_b32_e32 v39, v29, v26 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_or_b32_e32 v34, v21, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; SI-NEXT: v_or_b32_e32 v53, v7, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 -; SI-NEXT: v_or_b32_e32 v50, v25, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; SI-NEXT: v_or_b32_e32 v35, v13, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 -; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 -; SI-NEXT: v_or_b32_e32 v22, v21, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_or_b32_e32 v24, v24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 -; SI-NEXT: v_or_b32_e32 v10, v10, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_or_b32_e32 v5, v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v21 -; SI-NEXT: v_alignbit_b32 v56, v3, v47, 16 -; SI-NEXT: v_alignbit_b32 v47, v6, v46, 16 -; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16 -; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 -; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16 -; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16 -; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 -; SI-NEXT: v_mov_b32_e32 v60, v37 -; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v48, v48, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v38, v38, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v16, v16, v55 +; SI-NEXT: v_or_b32_e32 v35, v35, v59 +; SI-NEXT: v_or_b32_e32 v20, v20, v53 +; SI-NEXT: v_or_b32_e32 v22, v22, v52 +; SI-NEXT: v_or_b32_e32 v32, v32, v60 +; SI-NEXT: v_or_b32_e32 v26, v26, v51 +; SI-NEXT: v_alignbit_b32 v56, v2, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v50, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v6, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v39, v57, 16 +; SI-NEXT: v_alignbit_b32 v44, v37, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v12, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v36, v58, 16 +; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v18, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v33, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v31, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v30, v60, 16 +; SI-NEXT: v_alignbit_b32 v51, v29, v51, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v28, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v28, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v28, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v52 +; SI-NEXT: v_or_b32_e32 v24, v24, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v28, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v56i16: @@ -48307,618 +46679,577 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v55 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v24 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v49, v19 -; SI-NEXT: v_mov_b32_e32 v53, v36 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_mov_b32_e32 v38, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v23 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_mov_b32_e32 v37, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_or_b32_e32 v34, v26, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v28, v19 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v28, v28, v15 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v26, v29, v2 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v29, v4 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v39, v5, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v5, v31, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v10 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v28, v28, v21 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v26, v26, v6 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 -; SI-NEXT: v_or_b32_e32 v38, v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v37, v28, v11 -; SI-NEXT: v_or_b32_e32 v62, v31, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_or_b32_e32 v26, v30, v8 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v26, v26, v12 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v26, v30, v14 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_or_b32_e32 v35, v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v48, v26, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 +; SI-NEXT: v_or_b32_e32 v36, v30, v20 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v38, v26, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v34, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v31 -; SI-NEXT: v_or_b32_e32 v56, v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v24 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v31, v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v56 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v49 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v60 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v62 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 -; SI-NEXT: v_or_b32_e32 v10, v10, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 -; SI-NEXT: v_or_b32_e32 v12, v12, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v27 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v18, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[52:53], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[54:55], v[21:22], 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v51 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v28 +; SI-NEXT: v_lshr_b64 v[52:53], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 +; SI-NEXT: v_lshr_b64 v[40:41], v[6:7], 16 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v30 -; SI-NEXT: v_or_b32_e32 v30, v28, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 -; SI-NEXT: v_lshr_b64 v[41:42], v[29:30], 16 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v61 -; SI-NEXT: v_lshr_b64 v[43:44], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v44, v34 -; SI-NEXT: v_mov_b32_e32 v42, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_lshr_b64 v[45:46], v[19:20], 16 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v61 -; SI-NEXT: v_or_b32_e32 v24, v24, v27 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 -; SI-NEXT: v_mov_b32_e32 v23, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v7, v63 -; SI-NEXT: v_mov_b32_e32 v34, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: v_lshr_b64 v[62:63], v[3:4], 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_lshr_b64 v[39:40], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[11:12], 16 -; SI-NEXT: v_mov_b32_e32 v11, v37 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 +; SI-NEXT: v_lshr_b64 v[42:43], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v41, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_lshr_b64 v[44:45], v[2:3], 16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[26:27], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v45, v37 +; SI-NEXT: v_mov_b32_e32 v43, v39 +; SI-NEXT: v_mov_b32_e32 v55, v38 +; SI-NEXT: v_mov_b32_e32 v53, v36 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v51, v35 +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v35, v31 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[24:25], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v62 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4372f11f8ab4a..c4d17c79d773e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -2849,357 +2849,267 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v30i32_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB12_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v60i16: @@ -3848,72 +3758,72 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_readfirstlane_b32 s46, v18 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_readfirstlane_b32 s47, v19 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s45, v17 +; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_writelane_b32 v30, s53, 13 ; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v19 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_writelane_b32 v30, s55, 15 ; SI-NEXT: v_readfirstlane_b32 s23, v19 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_readfirstlane_b32 s19, v2 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_readfirstlane_b32 s17, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s9, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s34, s5, 16 @@ -4012,227 +3922,144 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s30, 16 ; SI-NEXT: s_and_b32 s29, s46, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s64, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s94, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s55, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s54, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s52, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s51, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s46, s56, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $sgpr30 @@ -5082,181 +4909,279 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v30i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -5308,195 +5233,40 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_or_b32_e32 v9, v9, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v38 -; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v43 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v34 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v33 -; SI-NEXT: v_or_b32_e32 v26, v26, v40 -; SI-NEXT: v_or_b32_e32 v27, v27, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v63 -; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -5507,52 +5277,19 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v57, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v39, v13 -; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_or_b32_e32 v16, v38, v16 -; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_or_b32_e32 v20, v43, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v34, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_or_b32_e32 v26, v40, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_or_b32_e32 v28, v63, v28 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 @@ -5563,6 +5300,46 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -5570,36 +5347,90 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB14_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -6423,424 +6254,349 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v60i16_to_v30i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v10, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v62 +; SI-NEXT: v_or_b32_e32 v29, v0, v40 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB15_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v34, v32 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: @@ -7617,190 +7373,193 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v30i32_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -7813,44 +7572,38 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -7880,38 +7633,39 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 @@ -7927,38 +7681,35 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -7985,346 +7736,256 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v27 +; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v60, v25 +; SI-NEXT: v_mov_b32_e32 v57, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v60f16: @@ -8972,52 +8633,52 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_readfirstlane_b32 s41, v19 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s42, v18 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s43, v17 +; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s44, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_readfirstlane_b32 s45, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 -; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 ; SI-NEXT: v_readfirstlane_b32 s25, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 ; SI-NEXT: v_readfirstlane_b32 s26, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v18 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s29, v18 -; SI-NEXT: v_readfirstlane_b32 s22, v19 -; SI-NEXT: v_readfirstlane_b32 s21, v1 -; SI-NEXT: v_readfirstlane_b32 s20, v2 -; SI-NEXT: v_readfirstlane_b32 s19, v3 -; SI-NEXT: v_readfirstlane_b32 s18, v4 -; SI-NEXT: v_readfirstlane_b32 s17, v5 -; SI-NEXT: v_readfirstlane_b32 s16, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s23, v17 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_readfirstlane_b32 s21, v0 +; SI-NEXT: v_readfirstlane_b32 s20, v1 +; SI-NEXT: v_readfirstlane_b32 s19, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s17, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s9, v15 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -9034,99 +8695,98 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true ; SI-NEXT: s_add_i32 s40, s40, 3 @@ -9135,13 +8795,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -9165,13 +8825,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s47, s43, 16 ; SI-NEXT: s_lshr_b32 s56, s44, 16 ; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_lshr_b32 s58, s23, 16 -; SI-NEXT: s_lshr_b32 s59, s24, 16 -; SI-NEXT: s_lshr_b32 s60, s25, 16 -; SI-NEXT: s_lshr_b32 s61, s26, 16 -; SI-NEXT: s_lshr_b32 s62, s27, 16 -; SI-NEXT: s_lshr_b32 s63, s28, 16 -; SI-NEXT: s_lshr_b32 s72, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_lshr_b32 s62, s28, 16 +; SI-NEXT: s_lshr_b32 s63, s29, 16 +; SI-NEXT: s_lshr_b32 s72, s23, 16 ; SI-NEXT: s_lshr_b32 s73, s22, 16 ; SI-NEXT: s_lshr_b32 s74, s21, 16 ; SI-NEXT: s_lshr_b32 s75, s20, 16 @@ -9189,280 +8849,125 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s95, s7, 16 ; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 ; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v57, v57, v58 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v47, v47, v56 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v45, v45, v46 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v43, v44, v43 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v41, v42, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v55, v40, v55 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v53, v54, v53 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v36, v38, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_or_b32_e32 v34, v36, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v32, v34, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -9476,69 +8981,137 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v13, v54, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v38, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v55, v12 +; SI-NEXT: v_or_b32_e32 v14, v53, v14 +; SI-NEXT: v_or_b32_e32 v16, v51, v16 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v39, v20 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: @@ -10356,218 +9929,244 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v30i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -10630,7 +10229,11 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 @@ -10641,6 +10244,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v54, v22 ; SI-NEXT: v_or_b32_e32 v23, v52, v23 ; SI-NEXT: v_or_b32_e32 v24, v50, v24 ; SI-NEXT: v_or_b32_e32 v25, v48, v25 @@ -10652,10 +10259,19 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -10669,111 +10285,93 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v22, v62, v22 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 @@ -10791,217 +10389,207 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -11009,9 +10597,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -11021,34 +10612,27 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -11086,22 +10670,22 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11927,509 +11511,565 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v30i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB19_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v14, v55, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v34, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: s_branch .LBB19_3 -; SI-NEXT: .LBB19_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: s_cbranch_vccnz .LBB19_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -12437,9 +12077,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -12449,7 +12089,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -12461,39 +12101,46 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -12505,12 +12152,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -12520,7 +12167,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 @@ -12533,25 +12180,48 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: .LBB19_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v60f16_to_v30i32_scalar: ; VI: ; %bb.0: @@ -15105,357 +14775,267 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v30f32_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30f32_to_v60i16: @@ -16073,372 +15653,309 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-LABEL: bitcast_v30f32_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v27, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v23, s22 -; SI-NEXT: v_mov_b32_e32 v24, s23 -; SI-NEXT: v_mov_b32_e32 v21, s24 -; SI-NEXT: v_mov_b32_e32 v22, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_mov_b32_e32 v18, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[31:32], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[30:31], v[14:15], 16 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v39 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v15, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: @@ -17342,181 +16859,279 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v30f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -17568,195 +17183,40 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_or_b32_e32 v9, v9, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v38 -; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v43 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v34 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v33 -; SI-NEXT: v_or_b32_e32 v26, v26, v40 -; SI-NEXT: v_or_b32_e32 v27, v27, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v63 -; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -17767,52 +17227,19 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v57, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v39, v13 -; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_or_b32_e32 v16, v38, v16 -; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_or_b32_e32 v20, v43, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v34, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_or_b32_e32 v26, v40, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_or_b32_e32 v28, v63, v28 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 @@ -17823,6 +17250,46 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -17830,36 +17297,90 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18683,424 +18204,349 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-LABEL: bitcast_v60i16_to_v30f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v10, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v62 +; SI-NEXT: v_or_b32_e32 v29, v0, v40 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB31_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v34, v32 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: @@ -19877,190 +19323,193 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v30f32_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -20073,44 +19522,38 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -20140,38 +19583,39 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 @@ -20187,38 +19631,35 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -20245,346 +19686,256 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v27 +; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v60, v25 +; SI-NEXT: v_mov_b32_e32 v57, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30f32_to_v60f16: @@ -21202,22 +20553,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-LABEL: bitcast_v30f32_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v29, s17 -; SI-NEXT: v_mov_b32_e32 v50, s18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: v_mov_b32_e32 v48, s19 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v51, s19 ; SI-NEXT: v_mov_b32_e32 v49, s20 -; SI-NEXT: v_mov_b32_e32 v48, s21 -; SI-NEXT: v_mov_b32_e32 v39, s22 -; SI-NEXT: v_mov_b32_e32 v38, s23 -; SI-NEXT: v_mov_b32_e32 v37, s24 -; SI-NEXT: v_mov_b32_e32 v36, s25 -; SI-NEXT: v_mov_b32_e32 v35, s26 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v37, s23 +; SI-NEXT: v_mov_b32_e32 v36, s24 +; SI-NEXT: v_mov_b32_e32 v35, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v34, s27 -; SI-NEXT: v_mov_b32_e32 v33, s28 -; SI-NEXT: v_mov_b32_e32 v32, s29 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v33, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -21236,246 +20587,241 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v39 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -21490,322 +20836,226 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v62 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21822,96 +21072,102 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; kill: killed $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; kill: killed $vgpr52 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; kill: killed $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; kill: killed $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: @@ -22815,218 +22071,244 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v30f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -23089,7 +22371,11 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 @@ -23100,6 +22386,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v54, v22 ; SI-NEXT: v_or_b32_e32 v23, v52, v23 ; SI-NEXT: v_or_b32_e32 v24, v50, v24 ; SI-NEXT: v_or_b32_e32 v25, v48, v25 @@ -23111,10 +22401,19 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -23128,111 +22427,93 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v22, v62, v22 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 @@ -23250,217 +22531,207 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -23468,9 +22739,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -23480,34 +22754,27 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -23545,22 +22812,22 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: .LBB34_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -24386,509 +23653,565 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-LABEL: bitcast_v60f16_to_v30f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB35_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v14, v55, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v34, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: s_branch .LBB35_3 -; SI-NEXT: .LBB35_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: .LBB35_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: s_cbranch_vccnz .LBB35_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -24896,9 +24219,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -24908,7 +24231,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -24920,39 +24243,46 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -24964,12 +24294,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -24979,7 +24309,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 @@ -24992,25 +24322,48 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: .LBB35_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v60f16_to_v30f32_scalar: ; VI: ; %bb.0: @@ -26634,357 +25987,267 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v15i64_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15i64_to_v60i16: @@ -27649,72 +26912,72 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v20, s30, 0 -; SI-NEXT: v_writelane_b32 v20, s31, 1 -; SI-NEXT: v_writelane_b32 v20, s34, 2 -; SI-NEXT: v_writelane_b32 v20, s35, 3 -; SI-NEXT: v_writelane_b32 v20, s36, 4 -; SI-NEXT: v_writelane_b32 v20, s37, 5 -; SI-NEXT: v_writelane_b32 v20, s38, 6 -; SI-NEXT: v_writelane_b32 v20, s39, 7 -; SI-NEXT: v_writelane_b32 v20, s48, 8 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_writelane_b32 v20, s49, 9 -; SI-NEXT: v_readfirstlane_b32 s46, v18 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_readfirstlane_b32 s47, v19 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_writelane_b32 v20, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s45, v17 +; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_writelane_b32 v30, s53, 13 ; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_writelane_b32 v20, s52, 12 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s41, v19 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_writelane_b32 v20, s53, 13 -; SI-NEXT: v_readfirstlane_b32 s24, v18 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_writelane_b32 v30, s55, 15 ; SI-NEXT: v_readfirstlane_b32 s23, v19 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_writelane_b32 v20, s55, 15 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_readfirstlane_b32 s19, v2 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_readfirstlane_b32 s17, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s9, v12 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_and_b64 s[26:27], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s34, s5, 16 @@ -27813,227 +27076,144 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s27, s30, 16 ; SI-NEXT: s_and_b32 s29, s46, 0xffff ; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s64, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s94, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s55, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s54, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_and_b32 s27, s41, 0xffff -; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s52, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_lshl_b32 s24, s78, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xffff -; SI-NEXT: s_lshl_b32 s23, s51, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_lshl_b32 s22, s76, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 -; SI-NEXT: s_and_b32 s20, s21, 0xffff -; SI-NEXT: s_lshl_b32 s21, s50, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s20, s74, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xffff -; SI-NEXT: s_lshl_b32 s19, s49, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s18, s72, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: s_lshl_b32 s17, s48, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s62, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s39, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s60, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s38, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s58, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s37, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s56, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s36, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_lshl_b32 s46, s56, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s28, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s35, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s26, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s64, v20, 16 -; SI-NEXT: v_readlane_b32 s55, v20, 15 -; SI-NEXT: v_readlane_b32 s54, v20, 14 -; SI-NEXT: v_readlane_b32 s53, v20, 13 -; SI-NEXT: v_readlane_b32 s52, v20, 12 -; SI-NEXT: v_readlane_b32 s51, v20, 11 -; SI-NEXT: v_readlane_b32 s50, v20, 10 -; SI-NEXT: v_readlane_b32 s49, v20, 9 -; SI-NEXT: v_readlane_b32 s48, v20, 8 -; SI-NEXT: v_readlane_b32 s39, v20, 7 -; SI-NEXT: v_readlane_b32 s38, v20, 6 -; SI-NEXT: v_readlane_b32 s37, v20, 5 -; SI-NEXT: v_readlane_b32 s36, v20, 4 -; SI-NEXT: v_readlane_b32 s35, v20, 3 -; SI-NEXT: v_readlane_b32 s34, v20, 2 -; SI-NEXT: v_readlane_b32 s31, v20, 1 -; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $sgpr30 @@ -28883,181 +28063,279 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v15i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -29109,195 +28387,40 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_or_b32_e32 v9, v9, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v38 -; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v43 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v34 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v33 -; SI-NEXT: v_or_b32_e32 v26, v26, v40 -; SI-NEXT: v_or_b32_e32 v27, v27, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v63 -; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -29308,52 +28431,19 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v57, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v39, v13 -; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_or_b32_e32 v16, v38, v16 -; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_or_b32_e32 v20, v43, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v34, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_or_b32_e32 v26, v40, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_or_b32_e32 v28, v63, v28 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 @@ -29364,6 +28454,46 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -29371,36 +28501,90 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -30224,424 +29408,349 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v60i16_to_v15i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v10, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v62 +; SI-NEXT: v_or_b32_e32 v29, v0, v40 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB43_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v34, v32 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: @@ -31418,190 +30527,193 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v15i64_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -31614,44 +30726,38 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31681,86 +30787,82 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -31787,346 +30889,256 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v27 +; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v60, v25 +; SI-NEXT: v_mov_b32_e32 v57, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15i64_to_v60f16: @@ -32790,52 +31802,52 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_readfirstlane_b32 s43, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_readfirstlane_b32 s41, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_readfirstlane_b32 s44, v17 +; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_readfirstlane_b32 s45, v19 -; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s46, v19 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_readfirstlane_b32 s46, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_readfirstlane_b32 s25, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s24, v18 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_mov_b32_e32 v19, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s25, v18 -; SI-NEXT: v_readfirstlane_b32 s26, v19 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_readfirstlane_b32 s19, v4 -; SI-NEXT: v_readfirstlane_b32 s16, v5 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v17 +; SI-NEXT: v_readfirstlane_b32 s23, v18 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s9, v15 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -32852,129 +31864,128 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 ; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s40, 3 ; SI-NEXT: s_addc_u32 s5, s43, 0 -; SI-NEXT: s_lshr_b32 s28, s4, 16 -; SI-NEXT: s_lshr_b32 s29, s5, 16 -; SI-NEXT: s_add_u32 s40, s41, 3 -; SI-NEXT: s_addc_u32 s41, s44, 0 -; SI-NEXT: s_lshr_b32 s43, s40, 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s43, s44, 0 ; SI-NEXT: s_lshr_b32 s44, s41, 16 +; SI-NEXT: s_lshr_b32 s47, s43, 16 ; SI-NEXT: s_add_u32 s42, s42, 3 ; SI-NEXT: s_addc_u32 s45, s45, 0 ; SI-NEXT: s_lshr_b32 s56, s42, 16 ; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_add_u32 s24, s24, 3 ; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s58, s24, 16 ; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_add_u32 s23, s23, 3 -; SI-NEXT: s_addc_u32 s47, s47, 0 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s47, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 ; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s26, s26, 0 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s26, 16 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s60, s25, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s28, s28, 0 +; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s63, s28, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s72, s22, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_lshr_b32 s74, s20, 16 @@ -33007,280 +32018,125 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 ; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s29 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v57, v57, v58 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v47, v47, v56 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v45, v45, v46 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v43, v44, v43 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v41, v42, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; SI-NEXT: v_or_b32_e32 v55, v40, v55 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v53, v54, v53 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v49, v50, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v39, v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v36, v38, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_or_b32_e32 v34, v36, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v32, v34, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v11, v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33294,69 +32150,137 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v13, v54, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 +; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v50, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v38, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v55, v12 +; SI-NEXT: v_or_b32_e32 v14, v53, v14 +; SI-NEXT: v_or_b32_e32 v16, v51, v16 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v39, v20 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: @@ -34174,218 +33098,244 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v15i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -34448,7 +33398,11 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 @@ -34459,6 +33413,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v54, v22 ; SI-NEXT: v_or_b32_e32 v23, v52, v23 ; SI-NEXT: v_or_b32_e32 v24, v50, v24 ; SI-NEXT: v_or_b32_e32 v25, v48, v25 @@ -34470,10 +33428,19 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -34487,111 +33454,93 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v22, v62, v22 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 @@ -34609,217 +33558,207 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -34827,9 +33766,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -34839,34 +33781,27 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -34904,22 +33839,22 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35745,509 +34680,565 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v15i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB47_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v14, v55, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v34, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: s_branch .LBB47_3 -; SI-NEXT: .LBB47_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: .LBB47_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: s_cbranch_vccnz .LBB47_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -36255,9 +35246,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -36267,7 +35258,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -36279,39 +35270,46 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -36323,12 +35321,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -36338,7 +35336,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 @@ -36351,25 +35349,48 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: .LBB47_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v60f16_to_v15i64_scalar: ; VI: ; %bb.0: @@ -37104,342 +36125,252 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v15f64_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15f64_to_v60i16: @@ -38027,22 +36958,22 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-LABEL: bitcast_v15f64_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v27, s16 -; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v25, s20 -; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v23, s22 -; SI-NEXT: v_mov_b32_e32 v24, s23 -; SI-NEXT: v_mov_b32_e32 v21, s24 -; SI-NEXT: v_mov_b32_e32 v22, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -38061,276 +36992,191 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v39 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v57 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -38347,39 +37193,59 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_mov_b32_e32 v15, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: @@ -39253,181 +38119,279 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v15f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v63 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -39479,195 +38443,40 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v56 -; SI-NEXT: v_or_b32_e32 v8, v8, v51 -; SI-NEXT: v_or_b32_e32 v9, v9, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v47 -; SI-NEXT: v_or_b32_e32 v13, v13, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v38 -; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v43 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v42 -; SI-NEXT: v_or_b32_e32 v23, v23, v34 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v33 -; SI-NEXT: v_or_b32_e32 v26, v26, v40 -; SI-NEXT: v_or_b32_e32 v27, v27, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v63 -; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -39678,52 +38487,19 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v53, v4 -; SI-NEXT: v_or_b32_e32 v5, v57, v5 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_or_b32_e32 v12, v47, v12 -; SI-NEXT: v_or_b32_e32 v13, v39, v13 -; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_or_b32_e32 v16, v38, v16 -; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_or_b32_e32 v20, v43, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v42, v22 -; SI-NEXT: v_or_b32_e32 v23, v34, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_or_b32_e32 v26, v40, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_or_b32_e32 v28, v63, v28 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v6, v63, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v62, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 @@ -39734,6 +38510,46 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 @@ -39741,36 +38557,90 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40594,424 +39464,349 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-LABEL: bitcast_v60i16_to_v15f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v10, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v1, v62 +; SI-NEXT: v_or_b32_e32 v29, v0, v40 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s40, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s18, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s18, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s18, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s18, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s18, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s26, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s18, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s18, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s6, s6, s18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v34, v32 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: @@ -41788,161 +40583,154 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v15f64_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 @@ -41963,146 +40751,147 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_mov_b32_e32 v60, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v63 -; SI-NEXT: v_mov_b32_e32 v63, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v59 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 +; SI-NEXT: v_mov_b32_e32 v63, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v57 +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 +; SI-NEXT: v_mov_b32_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 +; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: v_add_f64 v[30:31], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[35:36], v[2:3], 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -42125,347 +40914,258 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v62 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15f64_to_v60f16: @@ -43053,22 +41753,22 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-LABEL: bitcast_v15f64_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v23, s16 -; SI-NEXT: v_mov_b32_e32 v24, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v21, s24 -; SI-NEXT: v_mov_b32_e32 v22, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -43088,249 +41788,239 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f64 v[39:40], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[41:42], v[26:27], 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_f64 v[55:56], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[41:42], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 @@ -43340,316 +42030,220 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v33, v15 -; SI-NEXT: v_mov_b32_e32 v31, v16 -; SI-NEXT: v_mov_b32_e32 v62, v13 -; SI-NEXT: v_mov_b32_e32 v60, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: v_mov_b32_e32 v59, v13 +; SI-NEXT: v_mov_b32_e32 v58, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v60 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -43666,61 +42260,65 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -44629,218 +43227,244 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v15f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -44903,7 +43527,11 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 @@ -44914,6 +43542,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v54, v22 ; SI-NEXT: v_or_b32_e32 v23, v52, v23 ; SI-NEXT: v_or_b32_e32 v24, v50, v24 ; SI-NEXT: v_or_b32_e32 v25, v48, v25 @@ -44925,10 +43557,19 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -44942,111 +43583,93 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_or_b32_e32 v22, v62, v22 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 @@ -45064,217 +43687,207 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -45282,9 +43895,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -45294,34 +43910,27 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 @@ -45359,22 +43968,22 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -46200,509 +44809,565 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-LABEL: bitcast_v60f16_to_v15f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB55_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: s_lshr_b32 s5, s28, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v14, v55, v14 -; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v34, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: s_branch .LBB55_3 -; SI-NEXT: .LBB55_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 -; SI-NEXT: .LBB55_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: s_cbranch_vccnz .LBB55_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -46710,9 +45375,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -46722,7 +45387,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -46734,39 +45399,46 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -46778,12 +45450,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -46793,7 +45465,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 @@ -46806,25 +45478,48 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: .LBB55_5: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v60f16_to_v15f64_scalar: ; VI: ; %bb.0: @@ -47559,992 +46254,874 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 ; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 ; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v60f16: @@ -49153,936 +47730,657 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-LABEL: bitcast_v60i16_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 -; SI-NEXT: s_branch .LBB57_3 -; SI-NEXT: .LBB57_2: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: .LBB57_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v52 -; SI-NEXT: v_mov_b32_e32 v52, v54 -; SI-NEXT: v_mov_b32_e32 v54, v40 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v46, v56 -; SI-NEXT: v_mov_b32_e32 v56, v31 -; SI-NEXT: s_cbranch_vccnz .LBB57_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v21 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v37, v23 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v38, v24 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 +; SI-NEXT: v_mov_b32_e32 v49, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v52, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v33, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v19, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, s28 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v59 +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 +; SI-NEXT: v_or_b32_e32 v19, v33, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v32, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_or_b32_e32 v23, v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 +; SI-NEXT: v_or_b32_e32 v24, v32, v24 +; SI-NEXT: v_or_b32_e32 v25, v33, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v32, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_or_b32_e32 v27, v33, v27 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: v_mov_b32_e32 v52, v33 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: v_mov_b32_e32 v49, v27 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: v_mov_b32_e32 v38, v24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_mov_b32_e32 v37, v23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_mov_b32_e32 v33, v19 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v19, v28 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v60i16_to_v60f16_scalar: ; VI: ; %bb.0: @@ -51036,636 +49334,540 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v47 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v47, v21 -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v59, v33 -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_or_b32_e32 v63, v6, v34 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v12, v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v29, v29, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v37, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v48, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v52, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v55, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v21, v21, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v6, v35, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v24 -; SI-NEXT: v_or_b32_e32 v29, v29, v28 -; SI-NEXT: v_or_b32_e32 v54, v54, v51 -; SI-NEXT: v_or_b32_e32 v50, v50, v30 -; SI-NEXT: v_or_b32_e32 v39, v39, v41 -; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 -; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16 -; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 -; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 -; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v6, v35, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_or_b32_e32 v58, v35, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_or_b32_e32 v57, v46, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 -; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v56, v35, v17 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v59 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v59, v46, v43 -; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v47, v35, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_or_b32_e32 v61, v44, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 -; SI-NEXT: v_or_b32_e32 v36, v36, v45 -; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16 -; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v6, v33, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: .LBB58_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v34, v34, v35 -; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_or_b32_e32 v34, v34, v35 -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v54, v54, v58 +; SI-NEXT: v_or_b32_e32 v53, v53, v57 +; SI-NEXT: v_or_b32_e32 v51, v51, v56 +; SI-NEXT: v_or_b32_e32 v50, v50, v47 +; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v39, v39, v45 +; SI-NEXT: v_or_b32_e32 v38, v38, v44 +; SI-NEXT: v_or_b32_e32 v37, v37, v43 +; SI-NEXT: v_or_b32_e32 v36, v36, v42 +; SI-NEXT: v_or_b32_e32 v35, v35, v41 +; SI-NEXT: v_or_b32_e32 v34, v34, v40 +; SI-NEXT: v_or_b32_e32 v32, v32, v55 +; SI-NEXT: v_or_b32_e32 v33, v33, v52 +; SI-NEXT: v_or_b32_e32 v31, v31, v48 +; SI-NEXT: v_alignbit_b32 v59, v1, v59, 16 +; SI-NEXT: v_alignbit_b32 v58, v3, v58, 16 +; SI-NEXT: v_alignbit_b32 v57, v5, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v7, v56, 16 +; SI-NEXT: v_alignbit_b32 v47, v9, v47, 16 +; SI-NEXT: v_alignbit_b32 v46, v11, v46, 16 +; SI-NEXT: v_alignbit_b32 v45, v13, v45, 16 +; SI-NEXT: v_alignbit_b32 v44, v15, v44, 16 +; SI-NEXT: v_alignbit_b32 v43, v17, v43, 16 +; SI-NEXT: v_alignbit_b32 v42, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v41, v21, v41, 16 +; SI-NEXT: v_alignbit_b32 v40, v23, v40, 16 +; SI-NEXT: v_alignbit_b32 v55, v25, v55, 16 +; SI-NEXT: v_alignbit_b32 v52, v27, v52, 16 +; SI-NEXT: v_alignbit_b32 v48, v29, v48, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v10, v10, v49 +; SI-NEXT: v_or_b32_e32 v12, v12, v39 +; SI-NEXT: v_or_b32_e32 v14, v14, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60f16_to_v60i16: @@ -52275,682 +50477,712 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v53 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v35, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v33, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_or_b32_e32 v28, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v55 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: v_mov_b32_e32 v40, v54 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_lshr_b64 v[46:47], v[27:28], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_mov_b32_e32 v52, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v30, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v12, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v57, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 -; SI-NEXT: v_or_b32_e32 v62, v24, v32 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v42, v24, v27 -; SI-NEXT: v_mov_b32_e32 v48, v62 -; SI-NEXT: v_or_b32_e32 v50, v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 -; SI-NEXT: v_mov_b32_e32 v34, v42 -; SI-NEXT: v_lshr_b64 v[42:43], v[25:26], 16 -; SI-NEXT: v_mov_b32_e32 v62, v50 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_mov_b32_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v54, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_mov_b32_e32 v58, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 +; SI-NEXT: v_mov_b32_e32 v35, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v42, v19, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_or_b32_e32 v19, v21, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v41, v20, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_lshr_b64 v[50:51], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: v_or_b32_e32 v8, v38, v25 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_or_b32_e32 v8, v24, v21 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 +; SI-NEXT: v_or_b32_e32 v23, v25, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v53, v37, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v31 +; SI-NEXT: v_or_b32_e32 v63, v29, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v20, v17 +; SI-NEXT: v_lshr_b64 v[31:32], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v33, v61 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v38, v15 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 +; SI-NEXT: v_or_b32_e32 v49, v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[59:60], v[14:15], 16 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v35, v38, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v36, v36, v0 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_or_b32_e32 v24, v37, v4 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v36, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v55, v37, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v53 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v52, v36, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[4:5], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[8:9], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v25, v23 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_or_b32_e32 v30, v36, v41 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v54, v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v52, v38, v9 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v36, v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshr_b64 v[52:53], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v51, v24, v56 -; SI-NEXT: v_mov_b32_e32 v15, v51 -; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v24, v20, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v38, v18 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v36, v36, v22 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_or_b32_e32 v8, v38, v3 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v38, v26 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 +; SI-NEXT: v_or_b32_e32 v30, v37, v20 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v39, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[32:33], 16 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v11, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: v_mov_b32_e32 v39, v31 -; SI-NEXT: v_mov_b32_e32 v31, v60 -; SI-NEXT: v_mov_b32_e32 v60, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v37 -; SI-NEXT: v_mov_b32_e32 v37, v55 -; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v36, v39, v62 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[36:37], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v50, v59 +; SI-NEXT: v_mov_b32_e32 v40, v19 +; SI-NEXT: v_mov_b32_e32 v38, v21 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v61 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index da908bc280e6e..ccc46cc5df39e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1646,48 +1646,51 @@ define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v3i32_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6bf16: @@ -1759,38 +1762,44 @@ define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s18, 16 -; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s17, 16 -; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s11, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s7, s18, 16 -; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s9, s17, 16 -; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s11, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s16, 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v3i32_to_v6bf16_scalar: @@ -1872,13 +1881,19 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v6bf16_to_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1890,29 +1905,29 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -2213,13 +2228,19 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 @@ -2589,54 +2610,59 @@ define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v3i32_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB12_4 -; SI-NEXT: .LBB12_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB12_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 -; SI-NEXT: .LBB12_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: .LBB12_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6f16: @@ -2709,37 +2735,49 @@ define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 i ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v3i32_to_v6f16_scalar: @@ -2821,13 +2859,22 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2839,45 +2886,45 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2956,13 +3003,22 @@ define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v3i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 @@ -3098,35 +3154,36 @@ define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v3i32_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB16_4 -; SI-NEXT: .LBB16_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB16_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 -; SI-NEXT: .LBB16_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6i16: @@ -3210,12 +3267,18 @@ define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 in ; SI-NEXT: s_lshr_b32 s10, s17, 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -3302,12 +3365,16 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3319,29 +3386,29 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 @@ -3423,31 +3490,34 @@ define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v6i16_to_v3i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s11, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -4859,48 +4929,51 @@ define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v3f32_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB24_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB24_4 -; SI-NEXT: .LBB24_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB24_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB24_2 -; SI-NEXT: .LBB24_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6bf16: @@ -4982,13 +5055,13 @@ define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB25_5 ; SI-NEXT: .LBB25_3: ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 @@ -5001,9 +5074,22 @@ define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: .LBB25_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6bf16_scalar: @@ -5089,13 +5175,19 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v6bf16_to_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5107,29 +5199,29 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB26_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: .LBB26_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5430,13 +5522,19 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 @@ -5806,54 +5904,59 @@ define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v3f32_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB28_4 -; SI-NEXT: .LBB28_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB28_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 -; SI-NEXT: .LBB28_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16: @@ -5925,37 +6028,49 @@ define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e64 v5, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v3f32_to_v6f16_scalar: @@ -6041,13 +6156,22 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6059,45 +6183,45 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6176,13 +6300,22 @@ define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 ; SI-LABEL: bitcast_v6f16_to_v3f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 @@ -6318,35 +6451,36 @@ define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v3f32_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB32_4 -; SI-NEXT: .LBB32_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB32_3: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: .LBB32_4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6i16: @@ -6422,12 +6556,12 @@ define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s16, 1.0 -; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[4:5], 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: s_branch .LBB33_5 ; SI-NEXT: .LBB33_3: ; SI-NEXT: ; implicit-def: $sgpr6 @@ -6435,15 +6569,22 @@ define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB33_2 ; SI-NEXT: .LBB33_4: -; SI-NEXT: v_mov_b32_e32 v7, s16 -; SI-NEXT: v_mov_b32_e32 v8, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: .LBB33_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6i16_scalar: @@ -6529,12 +6670,16 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6546,29 +6691,29 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 @@ -6650,31 +6795,34 @@ define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v6i16_to_v3f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s11, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 @@ -6780,106 +6928,114 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v12i8_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v1 -; SI-NEXT: v_mov_b32_e32 v13, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v7 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v17 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: v_or_b32_e32 v11, v4, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v5, v16, v2 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v3, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v12, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v11, v14, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_or_b32_e32 v13, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB36_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 ; SI-NEXT: .LBB36_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v12 -; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6bf16: @@ -7234,19 +7390,19 @@ define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_or_b32 s8, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_or_b32 s10, s5, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s25, 24 -; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_or_b32 s9, s5, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s27, 24 @@ -7292,26 +7448,32 @@ define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s8, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s5, 16 +; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 ; SI-NEXT: s_and_b32 s11, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s4, 16 +; SI-NEXT: s_lshl_b32 s9, s4, 16 ; SI-NEXT: .LBB37_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB37_4: ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB37_2 ; @@ -7585,13 +7747,19 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v6bf16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -8099,13 +8267,19 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 +; SI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 @@ -8654,93 +8828,102 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v12i8_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v15 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6f16: @@ -9103,11 +9286,11 @@ define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 i ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -9147,17 +9330,29 @@ define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 i ; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB41_2 ; @@ -9431,14 +9626,22 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -9746,13 +9949,22 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s20 -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 @@ -10028,110 +10240,112 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v12i8_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v15, v2 -; SI-NEXT: v_mov_b32_e32 v13, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB44_4 -; SI-NEXT: .LBB44_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB44_3: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v12, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v3, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v18 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v11, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 -; SI-NEXT: .LBB44_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6i16: @@ -10555,12 +10769,18 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in ; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: s_lshr_b32 s11, s10, 16 ; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s10, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -10840,26 +11060,25 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v5 -; SI-NEXT: v_mov_b32_e32 v16, v3 -; SI-NEXT: v_mov_b32_e32 v12, v4 -; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10873,39 +11092,37 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; SI-NEXT: .LBB46_3: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v4, v1, v18 -; SI-NEXT: v_or_b32_e32 v8, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v4, v1, v16 +; SI-NEXT: v_or_b32_e32 v8, v7, v15 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; SI-NEXT: v_bfe_u32 v7, v16, 8, 8 -; SI-NEXT: v_bfe_u32 v11, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 @@ -11153,53 +11370,54 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in ; SI-LABEL: bitcast_v6i16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s20, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s20, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_and_b32 s9, s20, 0xffff -; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: s_or_b32 s14, s9, s11 +; SI-NEXT: s_or_b32 s19, s9, s11 ; SI-NEXT: s_lshr_b32 s7, s5, 8 -; SI-NEXT: s_lshr_b32 s15, s14, 8 -; SI-NEXT: s_and_b32 s9, s19, 0xffff -; SI-NEXT: s_and_b32 s22, s21, 0xffff -; SI-NEXT: s_bfe_u32 s11, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s23, s21, 0x80008 +; SI-NEXT: s_lshr_b32 s21, s19, 8 +; SI-NEXT: s_bfe_u32 s9, s15, 0x80008 +; SI-NEXT: s_bfe_u32 s11, s14, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshl_b32 s5, s20, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s14, s6, 0x30000 +; SI-NEXT: s_add_i32 s19, s6, 0x30000 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s11, s5, 24 -; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 24 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: s_lshr_b32 s7, s5, 8 -; SI-NEXT: s_lshr_b32 s23, s14, 24 -; SI-NEXT: s_lshr_b32 s22, s14, 16 -; SI-NEXT: s_lshr_b32 s15, s14, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s21, s19, 8 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -11207,12 +11425,12 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s7 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, s14 -; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -11221,11 +11439,9 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v6i16_to_v12i8_scalar: @@ -11431,40 +11647,40 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v6bf16_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB48_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB48_4 -; SI-NEXT: .LBB48_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB48_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 @@ -11472,34 +11688,48 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB48_2 -; SI-NEXT: .LBB48_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6f16: @@ -11790,62 +12020,80 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v6bf16_to_v6f16_scalar: @@ -12183,69 +12431,85 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB50_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB50_4 -; SI-NEXT: .LBB50_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB50_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB50_2 -; SI-NEXT: .LBB50_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6bf16: @@ -12323,56 +12587,77 @@ define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i ; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s21 -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v6f16_to_v6bf16_scalar: @@ -12472,65 +12757,75 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v6bf16_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB52_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB52_4 -; SI-NEXT: .LBB52_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 -; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v5, v1, v5, 16 +; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6i16: @@ -12815,53 +13110,65 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 +; SI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_lshr_b64 v[7:8], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v3, v4 -; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB53_2 @@ -13180,54 +13487,61 @@ define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v4 -; SI-NEXT: v_mov_b32_e32 v7, v2 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB54_4 -; SI-NEXT: .LBB54_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB54_2 -; SI-NEXT: .LBB54_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6bf16: @@ -13304,53 +13618,62 @@ define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s16, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 16 ; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s8, s18, 16 -; SI-NEXT: s_lshl_b32 s9, s19, 16 -; SI-NEXT: s_lshl_b32 s11, s20, 16 -; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_lshl_b32 s10, s13, 16 +; SI-NEXT: s_lshl_b32 s6, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s14, 16 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s13, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s7, s12, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s5, 16 -; SI-NEXT: s_and_b32 s10, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: s_and_b32 s11, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v6i16_to_v6bf16_scalar: @@ -13446,45 +13769,63 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_alignbit_b32 v6, v2, v3, 16 ; SI-NEXT: .LBB56_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6i16: @@ -13562,44 +13903,61 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v6i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v6, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v3, v8, v7 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: s_branch .LBB57_2 @@ -13701,57 +14059,62 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v5 -; SI-NEXT: v_mov_b32_e32 v7, v4 -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v1 -; SI-NEXT: v_mov_b32_e32 v11, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB58_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB58_4 -; SI-NEXT: .LBB58_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB58_3: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB58_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 -; SI-NEXT: .LBB58_4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6f16: @@ -13828,38 +14191,53 @@ define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v6i16_to_v6f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v6i16_to_v6f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index f752cea3526af..daa771a843ee6 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -100,10 +100,8 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v2bf16: @@ -113,10 +111,8 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v2bf16: @@ -166,11 +162,9 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v3bf16: @@ -180,11 +174,9 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v3bf16: @@ -234,12 +226,8 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v4bf16: @@ -249,12 +237,8 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v4bf16: @@ -304,14 +288,8 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v6bf16: @@ -321,14 +299,8 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v6bf16: @@ -378,16 +350,8 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v8bf16: @@ -397,16 +361,8 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v8bf16: @@ -452,59 +408,29 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GCN-LABEL: v_load_global_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v16bf16: @@ -581,99 +507,33 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GCN-LABEL: v_load_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v13, v1 +; GCN-NEXT: v_mov_b32_e32 v12, v0 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v13, v1 +; GFX7-NEXT: v_mov_b32_e32 v12, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_load_global_v32bf16: @@ -766,184 +626,40 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-LABEL: v_load_global_v64bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NEXT: v_mov_b32_e32 v28, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[28:29], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[28:29], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[28:29], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[28:29], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[28:29], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[28:29], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dwordx4 v[24:27], v[28:29], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[28:29], s[4:7], 0 addr64 offset:112 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 -; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_global_v64bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v29, v1 +; GFX7-NEXT: v_mov_b32_e32 v28, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[28:29], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[28:29], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[28:29], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[28:29], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_dwordx4 v[16:19], v[28:29], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[28:29], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: buffer_load_dwordx4 v[24:27], v[28:29], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[28:29], s[4:7], 0 addr64 offset:112 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1073,30 +789,22 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1150,36 +858,24 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1232,38 +928,22 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,54 +987,22 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 -; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1398,88 +1046,24 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 -; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 -; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1533,167 +1117,28 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 -; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 -; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1761,423 +1206,44 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v64bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 -; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 -; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 -; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 -; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[28:31], v[32:33], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[24:27], v[32:33], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[20:23], v[32:33], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], v[32:33], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[32:33], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[32:33], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[32:33], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[32:33], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v64bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 -; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 -; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[28:31], v[32:33], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: buffer_store_dwordx4 v[24:27], v[32:33], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[32:33], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: buffer_store_dwordx4 v[16:19], v[32:33], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[32:33], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[32:33], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[32:33], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[32:33], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3287,30 +2353,22 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3364,36 +2422,24 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3446,38 +2492,22 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3521,54 +2551,22 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 -; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3612,88 +2610,24 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 -; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 -; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4216,11 +3150,13 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { ; GCN-LABEL: test_ret_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_ret_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_ret_v3bf16: @@ -4627,32 +3563,25 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v4, s30, 0 -; GCN-NEXT: v_writelane_b32 v4, s31, 1 +; GCN-NEXT: v_writelane_b32 v2, s30, 0 +; GCN-NEXT: v_writelane_b32 v2, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v4, 1 -; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: v_readlane_b32 s31, v2, 1 +; GCN-NEXT: v_readlane_b32 s30, v2, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -4664,31 +3593,24 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s18, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[16:17] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[16:17] ; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX7-NEXT: v_writelane_b32 v4, s30, 0 -; GFX7-NEXT: v_writelane_b32 v4, s31, 1 +; GFX7-NEXT: v_writelane_b32 v2, s30, 0 +; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v4, 1 -; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: v_readlane_b32 s31, v2, 1 +; GFX7-NEXT: v_readlane_b32 s30, v2, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4885,34 +3807,29 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v5, s30, 0 -; GCN-NEXT: v_writelane_b32 v5, s31, 1 +; GCN-NEXT: v_writelane_b32 v4, s30, 0 +; GCN-NEXT: v_writelane_b32 v4, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v2 +; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v5, 1 -; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: v_readlane_b32 s31, v4, 1 +; GCN-NEXT: v_readlane_b32 s30, v4, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -4932,19 +3849,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v2 +; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 @@ -5175,24 +4087,18 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v2 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v2 +; GCN-NEXT: buffer_store_short v1, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: v_readlane_b32 s30, v8, 0 @@ -5221,24 +4127,18 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: buffer_store_short v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 6, v2 +; GFX7-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v2 +; GFX7-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v6, 1 ; GFX7-NEXT: v_readlane_b32 s30, v6, 0 @@ -5462,44 +4362,32 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v4 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 8, v4 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 14, v4 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 10, v4 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v4 +; GCN-NEXT: buffer_store_short v3, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: v_readlane_b32 s30, v16, 0 @@ -5528,44 +4416,32 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 12, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX7-NEXT: buffer_store_short v3, v9, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 14, v4 +; GFX7-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 10, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 6, v4 +; GFX7-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v4 +; GFX7-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v10, 1 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0 @@ -5801,84 +4677,60 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 28, v8 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v8 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v8 +; GCN-NEXT: buffer_store_short v7, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v7, vcc, 16, v8 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 12, v8 +; GCN-NEXT: buffer_store_short v6, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v6, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 4, v8 +; GCN-NEXT: buffer_store_short v5, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v5, vcc, 30, v8 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v8 +; GCN-NEXT: buffer_store_short v4, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v4, vcc, 22, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 18, v8 +; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v3, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 10, v8 +; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v8 +; GCN-NEXT: buffer_store_short v1, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v16, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v15, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v14, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v13, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v12, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v10, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v9, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: v_readlane_b32 s30, v20, 0 @@ -5907,84 +4759,60 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 28, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX7-NEXT: buffer_store_short v7, v17, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 24, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 20, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 30, v8 +; GFX7-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 26, v8 +; GFX7-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 22, v8 +; GFX7-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 18, v8 +; GFX7-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 14, v8 +; GFX7-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 10, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 6, v8 +; GFX7-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 2, v8 +; GFX7-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v18, 1 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0 @@ -10820,35 +9648,27 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fadd_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 -; GCN-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v2, v3, v2 +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v2bf16: @@ -11000,47 +9820,35 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fadd_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v2, v2, v5 -; GCN-NEXT: v_add_f32_e32 v1, v1, v4 -; GCN-NEXT: v_add_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v3, v5, v4 +; GCN-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v3bf16: @@ -11242,59 +10050,43 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fadd_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v3, v3, v7 -; GCN-NEXT: v_add_f32_e32 v2, v2, v6 -; GCN-NEXT: v_add_f32_e32 v1, v1, v5 -; GCN-NEXT: v_add_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v4, v5, v4 +; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v3, v7, v6 +; GCN-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v4bf16: @@ -11540,107 +10332,75 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_fadd_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v7, v7, v15 -; GCN-NEXT: v_add_f32_e32 v6, v6, v14 -; GCN-NEXT: v_add_f32_e32 v5, v5, v13 -; GCN-NEXT: v_add_f32_e32 v4, v4, v12 -; GCN-NEXT: v_add_f32_e32 v3, v3, v11 -; GCN-NEXT: v_add_f32_e32 v2, v2, v10 -; GCN-NEXT: v_add_f32_e32 v1, v1, v9 -; GCN-NEXT: v_add_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v0 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v8, v9, v8 +; GCN-NEXT: v_add_f32_e32 v3, v3, v7 +; GCN-NEXT: v_add_f32_e32 v7, v11, v10 +; GCN-NEXT: v_add_f32_e32 v2, v2, v6 +; GCN-NEXT: v_add_f32_e32 v6, v13, v12 +; GCN-NEXT: v_add_f32_e32 v1, v1, v5 +; GCN-NEXT: v_add_f32_e32 v5, v15, v14 +; GCN-NEXT: v_add_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v8bf16: @@ -12088,207 +10848,139 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fadd_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v14, v14, v30 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v13, v13, v29 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v12, v12, v28 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v6 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_add_f32_e32 v16, v17, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GCN-NEXT: v_add_f32_e32 v7, v7, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_add_f32_e32 v18, v19, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; GCN-NEXT: v_add_f32_e32 v6, v6, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_add_f32_e32 v15, v15, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GCN-NEXT: v_add_f32_e32 v5, v5, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_add_f32_e32 v14, v14, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GCN-NEXT: v_add_f32_e32 v4, v4, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v10, v10, v26 -; GCN-NEXT: v_add_f32_e32 v9, v9, v25 -; GCN-NEXT: v_add_f32_e32 v8, v8, v24 -; GCN-NEXT: v_add_f32_e32 v7, v7, v23 -; GCN-NEXT: v_add_f32_e32 v6, v6, v22 -; GCN-NEXT: v_add_f32_e32 v5, v5, v21 -; GCN-NEXT: v_add_f32_e32 v4, v4, v20 -; GCN-NEXT: v_add_f32_e32 v3, v3, v19 -; GCN-NEXT: v_add_f32_e32 v2, v2, v18 -; GCN-NEXT: v_add_f32_e32 v1, v1, v17 -; GCN-NEXT: v_add_f32_e32 v0, v0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v13, v13, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GCN-NEXT: v_add_f32_e32 v3, v3, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_add_f32_e32 v12, v12, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v8 +; GCN-NEXT: v_add_f32_e32 v2, v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_add_f32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v11, v11, v17 +; GCN-NEXT: v_add_f32_e32 v1, v1, v9 +; GCN-NEXT: v_add_f32_e32 v9, v10, v19 +; GCN-NEXT: v_add_f32_e32 v0, v0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v12, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v13, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v14, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v15, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v16bf16: @@ -13132,527 +11824,271 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_fadd_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_add_f32_e32 v31, v31, v32 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GCN-NEXT: v_add_f32_e32 v31, v32, v31 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_add_f32_e32 v30, v30, v32 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_add_f32_e32 v29, v29, v32 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_add_f32_e32 v28, v28, v32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_add_f32_e32 v27, v27, v32 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_add_f32_e32 v26, v26, v32 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_add_f32_e32 v25, v25, v32 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_add_f32_e32 v24, v24, v32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_add_f32_e32 v23, v23, v32 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_add_f32_e32 v22, v22, v32 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_add_f32_e32 v21, v21, v32 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_add_f32_e32 v20, v20, v32 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_add_f32_e32 v19, v19, v32 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_add_f32_e32 v18, v18, v32 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_add_f32_e32 v17, v17, v32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_add_f32_e32 v16, v16, v32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_add_f32_e32 v15, v15, v32 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_add_f32_e32 v14, v14, v32 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v30, v14, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GCN-NEXT: v_add_f32_e32 v14, v32, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_add_f32_e32 v13, v13, v32 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v29, v13, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GCN-NEXT: v_add_f32_e32 v13, v32, v13 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_add_f32_e32 v12, v12, v32 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v28, v12, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_add_f32_e32 v12, v32, v12 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_add_f32_e32 v11, v11, v32 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v27, v11, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GCN-NEXT: v_add_f32_e32 v11, v32, v11 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_add_f32_e32 v10, v10, v32 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GCN-NEXT: v_add_f32_e32 v26, v32, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_add_f32_e32 v9, v9, v32 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GCN-NEXT: v_add_f32_e32 v25, v32, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_add_f32_e32 v8, v8, v32 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GCN-NEXT: v_add_f32_e32 v24, v32, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_add_f32_e32 v7, v7, v32 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GCN-NEXT: v_add_f32_e32 v23, v32, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_add_f32_e32 v6, v6, v32 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GCN-NEXT: v_add_f32_e32 v22, v32, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_add_f32_e32 v5, v5, v32 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GCN-NEXT: v_add_f32_e32 v21, v32, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_add_f32_e32 v4, v4, v32 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v4, v4, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GCN-NEXT: v_add_f32_e32 v20, v32, v20 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_add_f32_e32 v3, v3, v32 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v3, v3, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GCN-NEXT: v_add_f32_e32 v19, v32, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_add_f32_e32 v2, v2, v32 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v2, v2, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GCN-NEXT: v_add_f32_e32 v18, v32, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_add_f32_e32 v1, v1, v32 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, v0, v32 +; GCN-NEXT: v_add_f32_e32 v1, v1, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_add_f32_e32 v17, v32, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_add_f32_e32 v0, v0, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GCN-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v11, v27, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v28, v13, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v14, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_add_f32_e32 v17, v17, v14 +; GCN-NEXT: v_add_f32_e32 v14, v15, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v30, v31, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX7-NEXT: v_add_f32_e32 v25, v32, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX7-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX7-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v13, v30, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX7-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_add_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v32 +; GFX7-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_v32bf16: @@ -15637,35 +14073,27 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fsub_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_sub_f32_e32 v2, v3, v2 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fsub_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fsub_v2bf16: @@ -15817,47 +14245,35 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fsub_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 +; GCN-NEXT: v_sub_f32_e32 v3, v5, v4 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fsub_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fsub_v3bf16: @@ -16083,59 +14499,43 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fsub_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v7 -; GCN-NEXT: v_sub_f32_e32 v2, v2, v6 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v5 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_sub_f32_e32 v4, v5, v4 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 +; GCN-NEXT: v_sub_f32_e32 v3, v7, v6 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fsub_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fsub_v4bf16: @@ -16528,35 +14928,27 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fmul_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v2bf16: @@ -16708,47 +15100,35 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fmul_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v3, v5, v4 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v3bf16: @@ -16950,59 +15330,43 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fmul_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v4, v5, v4 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v3, v7, v6 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v4bf16: @@ -17248,107 +15612,75 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_fmul_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v15 -; GCN-NEXT: v_mul_f32_e32 v6, v6, v14 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v13 -; GCN-NEXT: v_mul_f32_e32 v4, v4, v12 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v11 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v10 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v9 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v0 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v8, v9, v8 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 +; GCN-NEXT: v_mul_f32_e32 v7, v11, v10 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 +; GCN-NEXT: v_mul_f32_e32 v6, v13, v12 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 +; GCN-NEXT: v_mul_f32_e32 v5, v15, v14 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v8bf16: @@ -17796,207 +16128,139 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fmul_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, v14, v30 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, v13, v29 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v6 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_mul_f32_e32 v16, v17, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v18, v19, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_mul_f32_e32 v15, v15, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_mul_f32_e32 v14, v14, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 -; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 -; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 -; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v13, v13, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v12, v12, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v17 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v9 +; GCN-NEXT: v_mul_f32_e32 v9, v10, v19 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v12, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v13, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v14, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v15, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v16bf16: @@ -18840,527 +17104,271 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_fmul_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v31, v31, v32 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GCN-NEXT: v_mul_f32_e32 v31, v32, v31 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_mul_f32_e32 v30, v30, v32 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v29, v29, v32 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_mul_f32_e32 v28, v28, v32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v27, v27, v32 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_mul_f32_e32 v26, v26, v32 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_mul_f32_e32 v25, v25, v32 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_mul_f32_e32 v24, v24, v32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_mul_f32_e32 v23, v23, v32 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_mul_f32_e32 v22, v22, v32 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_mul_f32_e32 v21, v21, v32 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_mul_f32_e32 v20, v20, v32 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_mul_f32_e32 v19, v19, v32 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_mul_f32_e32 v18, v18, v32 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_mul_f32_e32 v17, v17, v32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_mul_f32_e32 v16, v16, v32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_mul_f32_e32 v15, v15, v32 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_mul_f32_e32 v14, v14, v32 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v30, v14, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GCN-NEXT: v_mul_f32_e32 v14, v32, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_mul_f32_e32 v13, v13, v32 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v29, v13, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v13, v32, v13 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_mul_f32_e32 v12, v12, v32 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v28, v12, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_mul_f32_e32 v12, v32, v12 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v32 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v27, v11, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GCN-NEXT: v_mul_f32_e32 v11, v32, v11 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_mul_f32_e32 v10, v10, v32 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v26, v32, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_mul_f32_e32 v9, v9, v32 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v25, v32, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v8, v8, v32 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v24, v32, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v32 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v23, v32, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_mul_f32_e32 v6, v6, v32 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v22, v32, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v32 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v21, v32, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v4, v4, v32 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v20, v32, v20 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v32 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v19, v32, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v32 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v18, v32, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v32 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v32 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v17, v32, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GCN-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v11, v27, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v28, v13, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v14, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_mul_f32_e32 v17, v17, v14 +; GCN-NEXT: v_mul_f32_e32 v14, v15, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v30, v31, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v25, v32, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX7-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX7-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v13, v30, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX7-NEXT: v_mul_f32_e32 v15, v15, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32 +; GFX7-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v15, v32, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v32bf16: @@ -21722,35 +19730,35 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_minnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_min_f32_e32 v1, v1, v3 -; GCN-NEXT: v_min_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v2, v3, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v2bf16: @@ -21902,47 +19910,47 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_minnum_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_min_f32_e32 v2, v2, v5 -; GCN-NEXT: v_min_f32_e32 v1, v1, v4 -; GCN-NEXT: v_min_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v1, v1, v3 +; GCN-NEXT: v_min_f32_e32 v3, v5, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_min_f32_e32 v0, v0, v2 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v3bf16: @@ -22144,59 +20152,59 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_minnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_min_f32_e32 v3, v3, v7 -; GCN-NEXT: v_min_f32_e32 v2, v2, v6 -; GCN-NEXT: v_min_f32_e32 v1, v1, v5 -; GCN-NEXT: v_min_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v4, v5, v4 +; GCN-NEXT: v_min_f32_e32 v1, v1, v3 +; GCN-NEXT: v_min_f32_e32 v3, v7, v6 +; GCN-NEXT: v_min_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v4bf16: @@ -22442,107 +20450,107 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_minnum_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_min_f32_e32 v7, v7, v15 -; GCN-NEXT: v_min_f32_e32 v6, v6, v14 -; GCN-NEXT: v_min_f32_e32 v5, v5, v13 -; GCN-NEXT: v_min_f32_e32 v4, v4, v12 -; GCN-NEXT: v_min_f32_e32 v3, v3, v11 -; GCN-NEXT: v_min_f32_e32 v2, v2, v10 -; GCN-NEXT: v_min_f32_e32 v1, v1, v9 -; GCN-NEXT: v_min_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v8, v9, v8 +; GCN-NEXT: v_min_f32_e32 v3, v3, v7 +; GCN-NEXT: v_min_f32_e32 v7, v11, v10 +; GCN-NEXT: v_min_f32_e32 v2, v2, v6 +; GCN-NEXT: v_min_f32_e32 v6, v13, v12 +; GCN-NEXT: v_min_f32_e32 v1, v1, v5 +; GCN-NEXT: v_min_f32_e32 v5, v15, v14 +; GCN-NEXT: v_min_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v8, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v8bf16: @@ -22990,207 +20998,203 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_minnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_min_f32_e32 v14, v14, v30 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_min_f32_e32 v13, v13, v29 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_min_f32_e32 v12, v12, v28 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_min_f32_e32 v16, v17, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_min_f32_e32 v7, v7, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_min_f32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_min_f32_e32 v6, v6, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_min_f32_e32 v15, v15, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_min_f32_e32 v5, v5, v13 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_min_f32_e32 v14, v14, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_min_f32_e32 v4, v4, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_min_f32_e32 v13, v13, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_min_f32_e32 v3, v3, v11 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_min_f32_e32 v12, v12, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v2, v2, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_min_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_min_f32_e32 v10, v10, v26 -; GCN-NEXT: v_min_f32_e32 v9, v9, v25 -; GCN-NEXT: v_min_f32_e32 v8, v8, v24 -; GCN-NEXT: v_min_f32_e32 v7, v7, v23 -; GCN-NEXT: v_min_f32_e32 v6, v6, v22 -; GCN-NEXT: v_min_f32_e32 v5, v5, v21 -; GCN-NEXT: v_min_f32_e32 v4, v4, v20 -; GCN-NEXT: v_min_f32_e32 v3, v3, v19 -; GCN-NEXT: v_min_f32_e32 v2, v2, v18 -; GCN-NEXT: v_min_f32_e32 v1, v1, v17 -; GCN-NEXT: v_min_f32_e32 v0, v0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_min_f32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_min_f32_e32 v11, v11, v17 +; GCN-NEXT: v_min_f32_e32 v1, v1, v9 +; GCN-NEXT: v_min_f32_e32 v9, v10, v19 +; GCN-NEXT: v_min_f32_e32 v0, v0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v12, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v14, v4, 16 +; GCN-NEXT: v_alignbit_b32 v5, v15, v5, 16 +; GCN-NEXT: v_alignbit_b32 v6, v10, v6, 16 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v10, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v11, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v12, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v14, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v15, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v16bf16: @@ -24034,527 +22038,399 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_minnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_min_f32_e32 v31, v31, v32 +; GCN-NEXT: v_min_f32_e32 v31, v32, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_min_f32_e32 v30, v30, v32 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_min_f32_e32 v29, v29, v32 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_min_f32_e32 v28, v28, v32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_min_f32_e32 v27, v27, v32 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_min_f32_e32 v26, v26, v32 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_min_f32_e32 v25, v25, v32 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_min_f32_e32 v24, v24, v32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_min_f32_e32 v23, v23, v32 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_min_f32_e32 v22, v22, v32 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_min_f32_e32 v21, v21, v32 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_min_f32_e32 v20, v20, v32 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_min_f32_e32 v19, v19, v32 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_min_f32_e32 v18, v18, v32 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_min_f32_e32 v17, v17, v32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_min_f32_e32 v16, v16, v32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_min_f32_e32 v15, v15, v32 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_min_f32_e32 v14, v14, v32 +; GCN-NEXT: v_min_f32_e32 v14, v14, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v30, v32, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_min_f32_e32 v13, v13, v32 +; GCN-NEXT: v_min_f32_e32 v13, v13, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v29, v32, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_min_f32_e32 v12, v12, v32 +; GCN-NEXT: v_min_f32_e32 v12, v12, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v28, v32, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_min_f32_e32 v11, v11, v32 +; GCN-NEXT: v_min_f32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v27, v32, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_min_f32_e32 v10, v10, v32 +; GCN-NEXT: v_min_f32_e32 v10, v10, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v26, v32, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_min_f32_e32 v9, v9, v32 +; GCN-NEXT: v_min_f32_e32 v9, v9, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v25, v32, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_min_f32_e32 v8, v8, v32 +; GCN-NEXT: v_min_f32_e32 v8, v8, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v24, v32, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_min_f32_e32 v7, v7, v32 +; GCN-NEXT: v_min_f32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v23, v32, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_min_f32_e32 v6, v6, v32 +; GCN-NEXT: v_min_f32_e32 v6, v6, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v22, v32, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_min_f32_e32 v5, v5, v32 +; GCN-NEXT: v_min_f32_e32 v5, v5, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v21, v32, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_min_f32_e32 v4, v4, v32 +; GCN-NEXT: v_min_f32_e32 v4, v4, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v20, v32, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_min_f32_e32 v3, v3, v32 +; GCN-NEXT: v_min_f32_e32 v3, v3, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v19, v32, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_min_f32_e32 v2, v2, v32 +; GCN-NEXT: v_min_f32_e32 v2, v2, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v18, v32, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_min_f32_e32 v1, v1, v32 +; GCN-NEXT: v_min_f32_e32 v1, v1, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v17, v32, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_min_f32_e32 v0, v0, v32 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_min_f32_e32 v0, v0, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v0, v17, v0, 16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v19, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v20, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v21, v4, 16 +; GCN-NEXT: v_alignbit_b32 v5, v22, v5, 16 +; GCN-NEXT: v_alignbit_b32 v6, v23, v6, 16 +; GCN-NEXT: v_alignbit_b32 v7, v24, v7, 16 +; GCN-NEXT: v_alignbit_b32 v8, v25, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v26, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v27, v10, 16 +; GCN-NEXT: v_alignbit_b32 v11, v28, v11, 16 +; GCN-NEXT: v_alignbit_b32 v12, v29, v12, 16 +; GCN-NEXT: v_alignbit_b32 v13, v30, v13, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_min_f32_e32 v17, v17, v18 +; GCN-NEXT: v_min_f32_e32 v15, v15, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GCN-NEXT: v_alignbit_b32 v14, v31, v14, 16 +; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v27, v27, v33 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX7-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; GFX7-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX7-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX7-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX7-NEXT: v_alignbit_b32 v12, v19, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v17, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_v32bf16: @@ -26298,35 +24174,35 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_max_f32_e32 v1, v1, v3 -; GCN-NEXT: v_max_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v2, v3, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v2bf16: @@ -26478,47 +24354,47 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_max_f32_e32 v2, v2, v5 -; GCN-NEXT: v_max_f32_e32 v1, v1, v4 -; GCN-NEXT: v_max_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v1, v1, v3 +; GCN-NEXT: v_max_f32_e32 v3, v5, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_max_f32_e32 v0, v0, v2 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v3bf16: @@ -26720,59 +24596,59 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_max_f32_e32 v3, v3, v7 -; GCN-NEXT: v_max_f32_e32 v2, v2, v6 -; GCN-NEXT: v_max_f32_e32 v1, v1, v5 -; GCN-NEXT: v_max_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v4, v5, v4 +; GCN-NEXT: v_max_f32_e32 v1, v1, v3 +; GCN-NEXT: v_max_f32_e32 v3, v7, v6 +; GCN-NEXT: v_max_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v4bf16: @@ -27018,107 +24894,107 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_max_f32_e32 v7, v7, v15 -; GCN-NEXT: v_max_f32_e32 v6, v6, v14 -; GCN-NEXT: v_max_f32_e32 v5, v5, v13 -; GCN-NEXT: v_max_f32_e32 v4, v4, v12 -; GCN-NEXT: v_max_f32_e32 v3, v3, v11 -; GCN-NEXT: v_max_f32_e32 v2, v2, v10 -; GCN-NEXT: v_max_f32_e32 v1, v1, v9 -; GCN-NEXT: v_max_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v8, v9, v8 +; GCN-NEXT: v_max_f32_e32 v3, v3, v7 +; GCN-NEXT: v_max_f32_e32 v7, v11, v10 +; GCN-NEXT: v_max_f32_e32 v2, v2, v6 +; GCN-NEXT: v_max_f32_e32 v6, v13, v12 +; GCN-NEXT: v_max_f32_e32 v1, v1, v5 +; GCN-NEXT: v_max_f32_e32 v5, v15, v14 +; GCN-NEXT: v_max_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v8, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v8bf16: @@ -27566,207 +25442,203 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_max_f32_e32 v14, v14, v30 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_max_f32_e32 v13, v13, v29 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_max_f32_e32 v12, v12, v28 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_max_f32_e32 v16, v17, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_max_f32_e32 v7, v7, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_max_f32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_max_f32_e32 v6, v6, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_max_f32_e32 v15, v15, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_max_f32_e32 v5, v5, v13 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_max_f32_e32 v14, v14, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_max_f32_e32 v4, v4, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_max_f32_e32 v13, v13, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_max_f32_e32 v3, v3, v11 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_max_f32_e32 v12, v12, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_max_f32_e32 v2, v2, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_max_f32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_max_f32_e32 v10, v10, v26 -; GCN-NEXT: v_max_f32_e32 v9, v9, v25 -; GCN-NEXT: v_max_f32_e32 v8, v8, v24 -; GCN-NEXT: v_max_f32_e32 v7, v7, v23 -; GCN-NEXT: v_max_f32_e32 v6, v6, v22 -; GCN-NEXT: v_max_f32_e32 v5, v5, v21 -; GCN-NEXT: v_max_f32_e32 v4, v4, v20 -; GCN-NEXT: v_max_f32_e32 v3, v3, v19 -; GCN-NEXT: v_max_f32_e32 v2, v2, v18 -; GCN-NEXT: v_max_f32_e32 v1, v1, v17 -; GCN-NEXT: v_max_f32_e32 v0, v0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_max_f32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_max_f32_e32 v11, v11, v17 +; GCN-NEXT: v_max_f32_e32 v1, v1, v9 +; GCN-NEXT: v_max_f32_e32 v9, v10, v19 +; GCN-NEXT: v_max_f32_e32 v0, v0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v12, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v14, v4, 16 +; GCN-NEXT: v_alignbit_b32 v5, v15, v5, 16 +; GCN-NEXT: v_alignbit_b32 v6, v10, v6, 16 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v10, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v11, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v12, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v14, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v15, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v16bf16: @@ -28610,527 +26482,399 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_max_f32_e32 v31, v31, v32 +; GCN-NEXT: v_max_f32_e32 v31, v32, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_max_f32_e32 v30, v30, v32 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_max_f32_e32 v29, v29, v32 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_max_f32_e32 v28, v28, v32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_max_f32_e32 v27, v27, v32 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_max_f32_e32 v26, v26, v32 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_max_f32_e32 v25, v25, v32 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_max_f32_e32 v24, v24, v32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_max_f32_e32 v23, v23, v32 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_max_f32_e32 v22, v22, v32 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_max_f32_e32 v21, v21, v32 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_max_f32_e32 v20, v20, v32 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_max_f32_e32 v19, v19, v32 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_max_f32_e32 v18, v18, v32 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_max_f32_e32 v17, v17, v32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_max_f32_e32 v16, v16, v32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_max_f32_e32 v15, v15, v32 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_max_f32_e32 v14, v14, v32 +; GCN-NEXT: v_max_f32_e32 v14, v14, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v30, v32, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_max_f32_e32 v13, v13, v32 +; GCN-NEXT: v_max_f32_e32 v13, v13, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v29, v32, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_max_f32_e32 v12, v12, v32 +; GCN-NEXT: v_max_f32_e32 v12, v12, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v28, v32, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_max_f32_e32 v11, v11, v32 +; GCN-NEXT: v_max_f32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v27, v32, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_max_f32_e32 v10, v10, v32 +; GCN-NEXT: v_max_f32_e32 v10, v10, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v26, v32, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_max_f32_e32 v9, v9, v32 +; GCN-NEXT: v_max_f32_e32 v9, v9, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v25, v32, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_max_f32_e32 v8, v8, v32 +; GCN-NEXT: v_max_f32_e32 v8, v8, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v24, v32, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_max_f32_e32 v7, v7, v32 +; GCN-NEXT: v_max_f32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v23, v32, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_max_f32_e32 v6, v6, v32 +; GCN-NEXT: v_max_f32_e32 v6, v6, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v22, v32, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_max_f32_e32 v5, v5, v32 +; GCN-NEXT: v_max_f32_e32 v5, v5, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v21, v32, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_max_f32_e32 v4, v4, v32 +; GCN-NEXT: v_max_f32_e32 v4, v4, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v20, v32, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_max_f32_e32 v3, v3, v32 +; GCN-NEXT: v_max_f32_e32 v3, v3, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v19, v32, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_max_f32_e32 v2, v2, v32 +; GCN-NEXT: v_max_f32_e32 v2, v2, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v18, v32, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_max_f32_e32 v1, v1, v32 +; GCN-NEXT: v_max_f32_e32 v1, v1, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v17, v32, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_max_f32_e32 v0, v0, v32 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_max_f32_e32 v0, v0, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v0, v17, v0, 16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; GCN-NEXT: v_alignbit_b32 v2, v19, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v20, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v21, v4, 16 +; GCN-NEXT: v_alignbit_b32 v5, v22, v5, 16 +; GCN-NEXT: v_alignbit_b32 v6, v23, v6, 16 +; GCN-NEXT: v_alignbit_b32 v7, v24, v7, 16 +; GCN-NEXT: v_alignbit_b32 v8, v25, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v26, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v27, v10, 16 +; GCN-NEXT: v_alignbit_b32 v11, v28, v11, 16 +; GCN-NEXT: v_alignbit_b32 v12, v29, v12, 16 +; GCN-NEXT: v_alignbit_b32 v13, v30, v13, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_max_f32_e32 v17, v17, v18 +; GCN-NEXT: v_max_f32_e32 v15, v15, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GCN-NEXT: v_alignbit_b32 v14, v31, v14, 16 +; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v27, v27, v33 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX7-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; GFX7-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX7-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX7-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX7-NEXT: v_alignbit_b32 v12, v19, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v17, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_v32bf16: @@ -36269,31 +34013,25 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16: @@ -36364,39 +34102,31 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16: @@ -36478,51 +34208,39 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16: @@ -36695,23 +34413,20 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32: @@ -36771,29 +34486,25 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32: @@ -36866,35 +34577,29 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v4 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_i32_f32_e32 v5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v4, v2 +; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32: @@ -37184,31 +34889,29 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GCN-NEXT: v_mul_f32_e64 v4, |v1|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; GCN-NEXT: v_trunc_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e64 v2, |v1|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-NEXT: v_floor_f32_e32 v2, v2 ; GCN-NEXT: v_floor_f32_e32 v4, v4 -; GCN-NEXT: v_fma_f32 v0, v2, s5, |v0| +; GCN-NEXT: v_fma_f32 v1, v2, s5, |v1| ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_fma_f32 v1, v4, s5, |v1| +; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0| ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v3 -; GCN-NEXT: v_xor_b32_e32 v6, v1, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN-NEXT: v_xor_b32_e32 v6, v0, v5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v3 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v5 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc @@ -37217,30 +34920,28 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_trunc_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_trunc_f32_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4 +; GFX7-NEXT: v_mul_f32_e64 v2, |v1|, s4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_floor_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0| +; GFX7-NEXT: v_trunc_f32_e32 v4, v0 +; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v1| +; GFX7-NEXT: v_mul_f32_e64 v0, |v4|, s4 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4 -; GFX7-NEXT: v_trunc_f32_e32 v3, v1 -; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4 -; GFX7-NEXT: v_floor_f32_e32 v1, v1 +; GFX7-NEXT: v_floor_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3| +; GFX7-NEXT: v_fma_f32 v5, v0, s5, |v4| ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1 -; GFX7-NEXT: v_xor_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v3, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX7-NEXT: v_xor_b32_e32 v2, v5, v3 ; GFX7-NEXT: v_xor_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -37451,42 +35152,39 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e64 v3, |v0|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GCN-NEXT: v_mul_f32_e64 v5, |v1|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GCN-NEXT: v_mul_f32_e64 v7, |v2|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-NEXT: v_mul_f32_e64 v3, |v2|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GCN-NEXT: v_mul_f32_e64 v5, |v0|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v0 +; GCN-NEXT: v_mul_f32_e64 v7, |v1|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GCN-NEXT: v_floor_f32_e32 v3, v3 ; GCN-NEXT: v_floor_f32_e32 v5, v5 ; GCN-NEXT: v_floor_f32_e32 v7, v7 -; GCN-NEXT: v_fma_f32 v0, v3, s5, |v0| +; GCN-NEXT: v_fma_f32 v2, v3, s5, |v2| ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_fma_f32 v1, v5, s5, |v1| +; GCN-NEXT: v_fma_f32 v0, v5, s5, |v0| ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_fma_f32 v2, v7, s5, |v2| +; GCN-NEXT: v_fma_f32 v1, v7, s5, |v1| ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v8 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-NEXT: v_xor_b32_e32 v9, v1, v6 -; GCN-NEXT: v_xor_b32_e32 v10, v2, v8 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 +; GCN-NEXT: v_xor_b32_e32 v9, v0, v6 +; GCN-NEXT: v_xor_b32_e32 v10, v1, v8 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v4 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v9, v6 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc @@ -37497,47 +35195,45 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_trunc_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_trunc_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4 +; GFX7-NEXT: v_mul_f32_e64 v3, |v2|, s4 ; GFX7-NEXT: v_floor_f32_e32 v3, v3 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0| +; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5 -; GFX7-NEXT: v_trunc_f32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4 +; GFX7-NEXT: v_trunc_f32_e32 v5, v0 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX7-NEXT: v_floor_f32_e32 v1, v1 -; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4| -; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5 -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3 -; GFX7-NEXT: v_trunc_f32_e32 v6, v2 -; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4 -; GFX7-NEXT: v_floor_f32_e32 v2, v2 -; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3 -; GFX7-NEXT: v_fma_f32 v7, v2, s5, |v6| +; GFX7-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX7-NEXT: v_floor_f32_e32 v0, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX7-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX7-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v2 +; GFX7-NEXT: v_trunc_f32_e32 v1, v1 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX7-NEXT: v_mul_f32_e64 v5, |v1|, s4 +; GFX7-NEXT: v_floor_f32_e32 v5, v5 +; GFX7-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX7-NEXT: v_fma_f32 v7, v5, s5, |v1| ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v2 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v5, v3 -; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v6 +; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX7-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5 -; GFX7-NEXT: v_xor_b32_e32 v6, v8, v5 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 -; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc +; GFX7-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX7-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v1 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64: @@ -37817,53 +35513,49 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_trunc_f32_e32 v0, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCN-NEXT: v_mul_f32_e64 v6, |v1|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-NEXT: v_mul_f32_e64 v8, |v2|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v2 -; GCN-NEXT: v_mul_f32_e64 v10, |v3|, s4 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mul_f32_e64 v4, |v2|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 +; GCN-NEXT: v_mul_f32_e64 v6, |v0|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v0 +; GCN-NEXT: v_mul_f32_e64 v8, |v3|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GCN-NEXT: v_mul_f32_e64 v10, |v1|, s4 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GCN-NEXT: v_floor_f32_e32 v4, v4 ; GCN-NEXT: v_floor_f32_e32 v6, v6 ; GCN-NEXT: v_floor_f32_e32 v8, v8 ; GCN-NEXT: v_floor_f32_e32 v10, v10 -; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0| +; GCN-NEXT: v_fma_f32 v2, v4, s5, |v2| ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_fma_f32 v1, v6, s5, |v1| +; GCN-NEXT: v_fma_f32 v0, v6, s5, |v0| ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GCN-NEXT: v_fma_f32 v2, v8, s5, |v2| +; GCN-NEXT: v_fma_f32 v3, v8, s5, |v3| ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_fma_f32 v3, v10, s5, |v3| +; GCN-NEXT: v_fma_f32 v1, v10, s5, |v1| ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v7 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_xor_b32_e32 v8, v8, v9 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_xor_b32_e32 v8, v8, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_xor_b32_e32 v10, v10, v11 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v12, v1, v7 -; GCN-NEXT: v_xor_b32_e32 v13, v2, v9 -; GCN-NEXT: v_xor_b32_e32 v14, v3, v11 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 +; GCN-NEXT: v_xor_b32_e32 v12, v0, v7 +; GCN-NEXT: v_xor_b32_e32 v13, v3, v9 +; GCN-NEXT: v_xor_b32_e32 v14, v1, v11 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v12, v7 ; GCN-NEXT: v_subb_u32_e32 v3, vcc, v6, v7, vcc @@ -37876,60 +35568,57 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_trunc_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_trunc_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4 +; GFX7-NEXT: v_mul_f32_e64 v3, |v2|, s4 ; GFX7-NEXT: v_floor_f32_e32 v3, v3 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0| -; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6 -; GFX7-NEXT: v_trunc_f32_e32 v5, v1 -; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4 +; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX7-NEXT: v_trunc_f32_e32 v5, v0 ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX7-NEXT: v_floor_f32_e32 v1, v1 -; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5| -; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6 -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX7-NEXT: v_floor_f32_e32 v0, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX7-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX7-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX7-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v2 +; GFX7-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3 -; GFX7-NEXT: v_trunc_f32_e32 v7, v2 -; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4 -; GFX7-NEXT: v_floor_f32_e32 v2, v2 -; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3 -; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7| -; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3 -; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc -; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5 -; GFX7-NEXT: v_trunc_f32_e32 v8, v4 -; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4 -; GFX7-NEXT: v_floor_f32_e32 v4, v4 -; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5 -; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8| +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_trunc_f32_e32 v5, v5 +; GFX7-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_mul_f32_e64 v6, |v5|, s4 +; GFX7-NEXT: v_floor_f32_e32 v6, v6 +; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_fma_f32 v7, v6, s5, |v5| +; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX7-NEXT: v_trunc_f32_e32 v1, v1 +; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc +; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX7-NEXT: v_mul_f32_e64 v7, |v1|, s4 +; GFX7-NEXT: v_floor_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX7-NEXT: v_fma_f32 v9, v7, s5, |v1| ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5 -; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8 +; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX7-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7 -; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 -; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc +; GFX7-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX7-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 +; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64: @@ -38384,23 +36073,23 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GCN-LABEL: v_sitofp_v2i16_to_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16: @@ -38538,29 +36227,29 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GCN-LABEL: v_sitofp_v3i16_to_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -38761,35 +36450,35 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GCN-LABEL: v_sitofp_v4i16_to_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16: @@ -39110,18 +36799,18 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16: @@ -39251,22 +36940,22 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16: @@ -39450,26 +37139,26 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16: @@ -39882,10 +37571,10 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GCN-LABEL: v_sitofp_v2i64_to_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ffbh_i32_e32 v4, v3 -; GCN-NEXT: v_xor_b32_e32 v5, v2, v3 -; GCN-NEXT: v_ffbh_i32_e32 v6, v1 -; GCN-NEXT: v_xor_b32_e32 v7, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v4, v1 +; GCN-NEXT: v_xor_b32_e32 v5, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v6, v3 +; GCN-NEXT: v_xor_b32_e32 v7, v2, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6 @@ -39894,51 +37583,51 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7 ; GCN-NEXT: v_min_u32_e32 v4, v4, v5 ; GCN-NEXT: v_min_u32_e32 v5, v6, v7 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v5 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3 -; GFX7-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_ffbh_i32_e32 v4, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 ; GFX7-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 -; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 -; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_cvt_f32_i32_e32 v5, v0 +; GFX7-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 32, v1 +; GFX7-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[2:3], v6 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v4 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v1, v5, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v6 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16: @@ -40245,10 +37934,10 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ffbh_i32_e32 v6, v5 ; GCN-NEXT: v_xor_b32_e32 v7, v4, v5 -; GCN-NEXT: v_ffbh_i32_e32 v8, v3 -; GCN-NEXT: v_xor_b32_e32 v9, v2, v3 -; GCN-NEXT: v_ffbh_i32_e32 v10, v1 -; GCN-NEXT: v_xor_b32_e32 v11, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v8, v1 +; GCN-NEXT: v_xor_b32_e32 v9, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v10, v3 +; GCN-NEXT: v_xor_b32_e32 v11, v2, v3 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8 @@ -40263,25 +37952,25 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GCN-NEXT: v_min_u32_e32 v8, v10, v11 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v7 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v8 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v4 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6 -; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v6 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v7 +; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16: @@ -40294,38 +37983,38 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7 ; GFX7-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 -; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 -; GFX7-NEXT: v_ffbh_i32_e32 v6, v3 -; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GFX7-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 -; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 -; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 +; GFX7-NEXT: v_ldexp_f32_e32 v6, v4, v5 ; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX7-NEXT: v_ffbh_i32_e32 v4, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 -; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 -; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_min_u32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshl_b64 v[4:5], v[0:1], v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX7-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GFX7-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v4 +; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 +; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16: @@ -40798,14 +38487,14 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GCN-LABEL: v_sitofp_v4i64_to_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ffbh_i32_e32 v8, v7 -; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 -; GCN-NEXT: v_ffbh_i32_e32 v10, v5 -; GCN-NEXT: v_xor_b32_e32 v11, v4, v5 -; GCN-NEXT: v_ffbh_i32_e32 v12, v3 -; GCN-NEXT: v_xor_b32_e32 v13, v2, v3 -; GCN-NEXT: v_ffbh_i32_e32 v14, v1 -; GCN-NEXT: v_xor_b32_e32 v15, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v8, v5 +; GCN-NEXT: v_xor_b32_e32 v9, v4, v5 +; GCN-NEXT: v_ffbh_i32_e32 v10, v7 +; GCN-NEXT: v_xor_b32_e32 v11, v6, v7 +; GCN-NEXT: v_ffbh_i32_e32 v12, v1 +; GCN-NEXT: v_xor_b32_e32 v13, v0, v1 +; GCN-NEXT: v_ffbh_i32_e32 v14, v3 +; GCN-NEXT: v_xor_b32_e32 v15, v2, v3 ; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10 @@ -40822,91 +38511,91 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GCN-NEXT: v_min_u32_e32 v9, v10, v11 ; GCN-NEXT: v_min_u32_e32 v10, v12, v13 ; GCN-NEXT: v_min_u32_e32 v11, v14, v15 -; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 +; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 -; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9 +; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v9 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v11 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11 -; GCN-NEXT: v_min_u32_e32 v6, 1, v6 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 +; GCN-NEXT: v_min_u32_e32 v6, 1, v6 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, v4 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, v5 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v8 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9 -; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v10 +; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7 -; GFX7-NEXT: v_ffbh_i32_e32 v8, v7 -; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8 -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9 -; GFX7-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 ; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX7-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8 ; GFX7-NEXT: v_ffbh_i32_e32 v8, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9 ; GFX7-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 -; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3 -; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7 -; GFX7-NEXT: v_ffbh_i32_e32 v7, v3 -; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8 -; GFX7-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 -; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 -; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX7-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_cvt_f32_i32_e32 v9, v4 +; GFX7-NEXT: v_ffbh_i32_e32 v4, v7 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 -; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 -; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX7-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX7-NEXT: v_lshl_b64 v[4:5], v[6:7], v10 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 32, v8 +; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX7-NEXT: v_ldexp_f32_e32 v5, v9, v6 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 32, v10 +; GFX7-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v6 +; GFX7-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7 +; GFX7-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX7-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 32, v1 +; GFX7-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[2:3], v8 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v6 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v1, v7, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v8 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16: @@ -41558,23 +39247,23 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GCN-LABEL: v_uitofp_v2i16_to_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16: @@ -41727,29 +39416,29 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GCN-LABEL: v_uitofp_v3i16_to_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -41956,35 +39645,35 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GCN-LABEL: v_uitofp_v4i16_to_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16: @@ -42331,18 +40020,18 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16: @@ -42472,22 +40161,22 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16: @@ -42671,26 +40360,26 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: @@ -43063,47 +40752,47 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GCN-LABEL: v_uitofp_v2i64_to_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ffbh_u32_e32 v4, v3 -; GCN-NEXT: v_ffbh_u32_e32 v5, v1 +; GCN-NEXT: v_ffbh_u32_e32 v4, v1 +; GCN-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-NEXT: v_min_u32_e32 v4, 32, v4 ; GCN-NEXT: v_min_u32_e32 v5, 32, v5 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v5 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v4 +; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX7-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX7-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 -; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 -; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX7-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX7-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[2:3], v6 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v4 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v1, v5, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v6 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16: @@ -43348,32 +41037,32 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ffbh_u32_e32 v6, v5 -; GCN-NEXT: v_ffbh_u32_e32 v7, v3 -; GCN-NEXT: v_ffbh_u32_e32 v8, v1 +; GCN-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-NEXT: v_ffbh_u32_e32 v8, v3 ; GCN-NEXT: v_min_u32_e32 v6, 32, v6 ; GCN-NEXT: v_min_u32_e32 v7, 32, v7 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v7 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v8 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v4 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6 -; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v6 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v7 +; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16: @@ -43384,28 +41073,28 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 -; GFX7-NEXT: v_ffbh_u32_e32 v6, v3 -; GFX7-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 +; GFX7-NEXT: v_ldexp_f32_e32 v6, v4, v5 +; GFX7-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX7-NEXT: v_min_u32_e32 v7, 32, v4 +; GFX7-NEXT: v_lshl_b64 v[4:5], v[0:1], v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v4 +; GFX7-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX7-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 32, v4 +; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 +; GFX7-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16: @@ -43784,83 +41473,83 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GCN-LABEL: v_uitofp_v4i64_to_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ffbh_u32_e32 v8, v7 -; GCN-NEXT: v_ffbh_u32_e32 v9, v5 -; GCN-NEXT: v_ffbh_u32_e32 v10, v3 -; GCN-NEXT: v_ffbh_u32_e32 v11, v1 +; GCN-NEXT: v_ffbh_u32_e32 v8, v5 +; GCN-NEXT: v_ffbh_u32_e32 v9, v7 +; GCN-NEXT: v_ffbh_u32_e32 v10, v1 +; GCN-NEXT: v_ffbh_u32_e32 v11, v3 ; GCN-NEXT: v_min_u32_e32 v8, 32, v8 ; GCN-NEXT: v_min_u32_e32 v9, 32, v9 ; GCN-NEXT: v_min_u32_e32 v10, 32, v10 ; GCN-NEXT: v_min_u32_e32 v11, 32, v11 -; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 +; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 -; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9 +; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v9 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9 -; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v11 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11 -; GCN-NEXT: v_min_u32_e32 v6, 1, v6 ; GCN-NEXT: v_min_u32_e32 v4, 1, v4 -; GCN-NEXT: v_min_u32_e32 v2, 1, v2 +; GCN-NEXT: v_min_u32_e32 v6, 1, v6 ; GCN-NEXT: v_min_u32_e32 v0, 1, v0 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_min_u32_e32 v2, 1, v2 ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, v5 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v8 ; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9 -; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10 -; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v10 +; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_ffbh_u32_e32 v8, v7 -; GFX7-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 -; GFX7-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8 ; GFX7-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7 -; GFX7-NEXT: v_ffbh_u32_e32 v7, v3 -; GFX7-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 ; GFX7-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 ; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX7-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX7-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX7-NEXT: v_lshl_b64 v[4:5], v[6:7], v10 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 32, v8 +; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX7-NEXT: v_ldexp_f32_e32 v5, v9, v6 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 32, v10 +; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v6 +; GFX7-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX7-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX7-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX7-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[2:3], v8 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v6 ; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 -; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 -; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 +; GFX7-NEXT: v_ldexp_f32_e32 v1, v7, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v8 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16: @@ -44589,39 +42278,29 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GCN-LABEL: v_select_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v2bf16: @@ -44721,18 +42400,14 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GCN-LABEL: v_vselect_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: s_mov_b32 s4, 0xffff +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v2bf16: @@ -44740,16 +42415,12 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v2bf16: @@ -44979,37 +42650,35 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -45144,33 +42813,27 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) { ; GCN-LABEL: s_vselect_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: s_mov_b32 s0, 0xffff +; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_vselect_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -45315,49 +42978,21 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GCN-LABEL: v_select_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v3bf16: @@ -45423,59 +43058,19 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GCN-LABEL: v_select_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v4bf16: @@ -45541,81 +43136,21 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GCN-LABEL: v_select_v6bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v6bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v6bf16: @@ -45687,103 +43222,23 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GCN-LABEL: v_select_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v8bf16: @@ -45859,199 +43314,31 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-LABEL: v_select_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v16bf16: @@ -46149,495 +43436,53 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GCN-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GCN-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 +; GCN-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc -; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc -; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc -; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc -; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v32bf16: @@ -46797,44 +43642,30 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_select_v3bf16: @@ -46936,50 +43767,26 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 -; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 -; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6 -; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_readfirstlane_b32 s0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -47074,56 +43881,42 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) { ; GCN-LABEL: s_vselect_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6 -; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GCN-NEXT: s_mov_b32 s1, 0xffff +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: v_bfi_b32 v2, s1, v2, v3 +; GCN-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_vselect_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1 -; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: v_bfi_b32 v2, s1, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 ; GFX7-NEXT: ; return to shader part epilog @@ -47361,30 +44154,21 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GCN-LABEL: v_vselect_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_bfi_b32 v1, s4, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v4bf16: @@ -47392,28 +44176,19 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s4, v2, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v4bf16: @@ -47600,107 +44375,69 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-LABEL: v_vselect_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_bfi_b32 v1, s4, v2, v3 +; GCN-NEXT: v_bfi_b32 v2, s4, v4, v5 +; GCN-NEXT: v_bfi_b32 v3, s4, v6, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s4, v2, v3 +; GFX7-NEXT: v_bfi_b32 v2, s4, v4, v5 +; GFX7-NEXT: v_bfi_b32 v3, s4, v6, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v8bf16: @@ -48024,268 +44761,129 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-LABEL: v_vselect_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 1, v10 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GCN-NEXT: v_and_b32_e32 v10, 1, v12 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; GCN-NEXT: v_and_b32_e32 v10, 1, v13 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v10, v30, v22, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-NEXT: v_and_b32_e32 v11, 1, v11 +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_and_b32_e32 v13, 1, v14 +; GCN-NEXT: v_and_b32_e32 v14, 1, v15 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v11, v29, v21, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v15, v29, v21, vcc +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0xffff +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GCN-NEXT: v_cndmask_b32_e32 v9, v28, v20, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v7, v27, v19, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GCN-NEXT: v_cndmask_b32_e32 v5, v26, v18, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GCN-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v24, v16, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v3 -; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v4 -; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v5 -; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v7 -; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v8 -; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v9 -; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v1, 1, v10 -; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v2, 1, v11 -; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v3, 1, v12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v7, 1, v13 -; GCN-NEXT: v_and_b32_e32 v8, 1, v14 -; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_and_b32_e32 v9, 1, v15 -; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[42:43] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[40:41] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23] -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21] -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19] -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17] -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15] -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_bfi_b32 v1, s4, v2, v3 +; GCN-NEXT: v_bfi_b32 v2, s4, v4, v5 +; GCN-NEXT: v_bfi_b32 v3, s4, v6, v7 +; GCN-NEXT: v_bfi_b32 v4, s4, v8, v9 +; GCN-NEXT: v_bfi_b32 v5, s4, v15, v11 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cndmask_b32_e32 v7, v21, v23, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e32 v8, v21, v23, vcc +; GCN-NEXT: v_bfi_b32 v6, s4, v12, v10 +; GCN-NEXT: v_bfi_b32 v7, s4, v8, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 1, v12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v30, v22, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 1, v13 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v29, v21, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v13, v29, v21, vcc +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v28, v20, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v27, v19, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v26, v18, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v24, v16, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v13 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v4 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v15 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v4 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[42:43] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[40:41] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[28:29] -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v2, v1, s[26:27] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v1, s[24:25] -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v4, v5, s[20:21] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v3, v5, s[18:19] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[14:15] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[6:7] -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v1, v19, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v17, s[4:5] -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v16, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10 +; GFX7-NEXT: v_cndmask_b32_e64 v10, v30, v22, s[6:7] +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s4, v2, v3 +; GFX7-NEXT: v_bfi_b32 v2, s4, v4, v5 +; GFX7-NEXT: v_bfi_b32 v3, s4, v6, v7 +; GFX7-NEXT: v_bfi_b32 v4, s4, v8, v9 +; GFX7-NEXT: v_bfi_b32 v5, s4, v13, v11 +; GFX7-NEXT: v_bfi_b32 v6, s4, v12, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v15, v21, v23, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc +; GFX7-NEXT: v_bfi_b32 v7, s4, v14, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v16bf16: @@ -48886,657 +45484,332 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-LABEL: v_vselect_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_and_b32_e32 v36, 1, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 -; GCN-NEXT: v_and_b32_e32 v53, 1, v26 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_and_b32_e32 v27, 1, v27 -; GCN-NEXT: v_and_b32_e32 v28, 1, v28 -; GCN-NEXT: v_and_b32_e32 v29, 1, v29 -; GCN-NEXT: v_and_b32_e32 v30, 1, v30 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 -; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5] -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 -; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 -; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27 -; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53 -; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5] -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v22, 1, v22 -; GCN-NEXT: v_and_b32_e32 v23, 1, v23 -; GCN-NEXT: v_and_b32_e32 v24, 1, v24 -; GCN-NEXT: v_and_b32_e32 v25, 1, v25 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25 -; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 -; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23 -; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 -; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5] -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v21, 1, v21 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v6 +; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v7 +; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v9 +; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v10 +; GCN-NEXT: v_and_b32_e32 v1, 1, v11 +; GCN-NEXT: v_and_b32_e32 v2, 1, v12 +; GCN-NEXT: v_and_b32_e32 v3, 1, v13 +; GCN-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-NEXT: v_and_b32_e32 v5, 1, v15 +; GCN-NEXT: v_and_b32_e32 v6, 1, v16 +; GCN-NEXT: v_and_b32_e32 v7, 1, v17 +; GCN-NEXT: v_and_b32_e32 v8, 1, v18 +; GCN-NEXT: v_and_b32_e32 v9, 1, v19 +; GCN-NEXT: v_and_b32_e32 v10, 1, v20 +; GCN-NEXT: v_and_b32_e32 v11, 1, v21 +; GCN-NEXT: v_and_b32_e32 v12, 1, v22 +; GCN-NEXT: v_and_b32_e32 v13, 1, v23 +; GCN-NEXT: v_and_b32_e32 v14, 1, v24 +; GCN-NEXT: v_and_b32_e32 v17, 1, v25 +; GCN-NEXT: v_and_b32_e32 v16, 1, v26 +; GCN-NEXT: v_and_b32_e32 v18, 1, v27 +; GCN-NEXT: v_and_b32_e32 v15, 1, v28 +; GCN-NEXT: v_and_b32_e32 v19, 1, v29 +; GCN-NEXT: v_and_b32_e32 v20, 1, v30 +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v9 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v10 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v11 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v12 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v13 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v14 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cndmask_b32_e32 v14, v12, v11, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_cndmask_b32_e32 v15, v12, v11, vcc +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cndmask_b32_e32 v13, v19, v22, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_cndmask_b32_e32 v16, v19, v22, vcc +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cndmask_b32_e32 v12, v18, v11, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_cndmask_b32_e64 v17, v18, v11, s[78:79] +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cndmask_b32_e64 v11, v19, v10, s[76:77] +; GCN-NEXT: v_cndmask_b32_e64 v18, v19, v10, s[74:75] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cndmask_b32_e64 v10, v20, v9, s[72:73] +; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v9, s[62:63] +; GCN-NEXT: v_cndmask_b32_e64 v9, v21, v8, s[60:61] +; GCN-NEXT: v_cndmask_b32_e64 v20, v21, v8, s[58:59] +; GCN-NEXT: v_cndmask_b32_e64 v8, v7, v6, s[56:57] +; GCN-NEXT: v_cndmask_b32_e64 v21, v7, v6, s[46:47] +; GCN-NEXT: v_cndmask_b32_e64 v7, v5, v4, s[44:45] +; GCN-NEXT: v_cndmask_b32_e64 v22, v5, v4, s[42:43] +; GCN-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[40:41] +; GCN-NEXT: v_cndmask_b32_e64 v23, v3, v2, s[28:29] +; GCN-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[26:27] +; GCN-NEXT: v_cndmask_b32_e64 v24, v1, v0, s[24:25] +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v1, s[22:23] +; GCN-NEXT: v_cndmask_b32_e64 v25, v2, v1, s[20:21] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 -; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 -; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19 -; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 -; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5] -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: v_and_b32_e32 v8, 1, v8 -; GCN-NEXT: v_and_b32_e32 v9, 1, v9 -; GCN-NEXT: v_and_b32_e32 v10, 1, v10 -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 -; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 -; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5] -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 -; GCN-NEXT: v_and_b32_e32 v11, 1, v11 -; GCN-NEXT: v_and_b32_e32 v12, 1, v12 -; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 -; GCN-NEXT: v_and_b32_e32 v26, 1, v26 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_cndmask_b32_e64 v3, v27, v26, s[18:19] +; GCN-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[16:17] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cndmask_b32_e64 v2, v28, v0, s[14:15] +; GCN-NEXT: v_cndmask_b32_e64 v27, v28, v0, s[12:13] +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_cndmask_b32_e64 v31, v1, v0, s[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 -; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: v_cndmask_b32_e64 v0, v29, v28, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 1, v30 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 +; GCN-NEXT: s_mov_b32 s6, 0xffff +; GCN-NEXT: v_bfi_b32 v0, s6, v28, v0 +; GCN-NEXT: v_bfi_b32 v1, s6, v1, v31 +; GCN-NEXT: v_bfi_b32 v2, s6, v27, v2 +; GCN-NEXT: v_bfi_b32 v3, s6, v26, v3 +; GCN-NEXT: v_bfi_b32 v4, s6, v25, v4 +; GCN-NEXT: v_bfi_b32 v5, s6, v24, v5 +; GCN-NEXT: v_bfi_b32 v6, s6, v23, v6 +; GCN-NEXT: v_bfi_b32 v7, s6, v22, v7 +; GCN-NEXT: v_bfi_b32 v8, s6, v21, v8 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 +; GCN-NEXT: v_bfi_b32 v9, s6, v20, v9 +; GCN-NEXT: v_bfi_b32 v10, s6, v19, v10 +; GCN-NEXT: v_bfi_b32 v11, s6, v18, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cndmask_b32_e32 v18, v22, v21, vcc +; GCN-NEXT: v_bfi_b32 v12, s6, v17, v12 +; GCN-NEXT: v_bfi_b32 v13, s6, v16, v13 +; GCN-NEXT: v_cndmask_b32_e64 v16, v22, v21, s[4:5] +; GCN-NEXT: v_bfi_b32 v14, s6, v15, v14 +; GCN-NEXT: v_bfi_b32 v15, s6, v18, v16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 -; GFX7-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25 -; GFX7-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30 -; GFX7-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29 -; GFX7-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28 -; GFX7-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27 -; GFX7-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26 -; GFX7-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX7-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX7-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX7-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX7-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX7-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX7-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX7-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX7-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5] -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 -; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v18 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v19 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v25 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v26 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v27 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v28 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v29 +; GFX7-NEXT: v_writelane_b32 v33, s30, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v30 +; GFX7-NEXT: v_writelane_b32 v33, s31, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v33, s34, 2 +; GFX7-NEXT: v_writelane_b32 v33, s35, 3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 +; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v30, v32, v31, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v32, v29, v28, s[94:95] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[92:93] +; GFX7-NEXT: v_cndmask_b32_e64 v29, v27, v26, s[90:91] +; GFX7-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[88:89] +; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[78:79] +; GFX7-NEXT: v_cndmask_b32_e64 v24, v25, v24, s[76:77] +; GFX7-NEXT: v_cndmask_b32_e64 v25, v23, v22, s[74:75] +; GFX7-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[72:73] +; GFX7-NEXT: v_cndmask_b32_e64 v23, v21, v20, s[62:63] +; GFX7-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[60:61] +; GFX7-NEXT: v_cndmask_b32_e64 v21, v19, v18, s[58:59] +; GFX7-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[56:57] +; GFX7-NEXT: v_cndmask_b32_e64 v19, v17, v16, s[46:47] +; GFX7-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[44:45] +; GFX7-NEXT: v_cndmask_b32_e64 v17, v15, v14, s[42:43] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v15, v14, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e64 v15, v13, v12, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v13, v12, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v11, v10, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v9, v8, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v7, v6, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v5, v4, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v3, v2, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX7-NEXT: v_bfi_b32 v1, s4, v2, v5 +; GFX7-NEXT: v_bfi_b32 v2, s4, v4, v7 +; GFX7-NEXT: v_bfi_b32 v3, s4, v6, v9 +; GFX7-NEXT: v_bfi_b32 v4, s4, v8, v11 +; GFX7-NEXT: v_bfi_b32 v5, s4, v10, v13 +; GFX7-NEXT: v_bfi_b32 v6, s4, v12, v15 +; GFX7-NEXT: v_bfi_b32 v7, s4, v14, v17 +; GFX7-NEXT: v_bfi_b32 v8, s4, v16, v19 +; GFX7-NEXT: v_bfi_b32 v9, s4, v18, v21 +; GFX7-NEXT: v_bfi_b32 v10, s4, v20, v23 +; GFX7-NEXT: v_bfi_b32 v11, s4, v22, v25 +; GFX7-NEXT: v_bfi_b32 v12, s4, v24, v27 +; GFX7-NEXT: v_bfi_b32 v13, s4, v26, v29 +; GFX7-NEXT: v_bfi_b32 v14, s4, v28, v32 +; GFX7-NEXT: v_bfi_b32 v15, s4, v31, v30 +; GFX7-NEXT: v_readlane_b32 s35, v33, 3 +; GFX7-NEXT: v_readlane_b32 s34, v33, 2 +; GFX7-NEXT: v_readlane_b32 s31, v33, 1 +; GFX7-NEXT: v_readlane_b32 s30, v33, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v32bf16: @@ -51215,43 +47488,31 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GCN-LABEL: v_fma_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_fma_f32 v1, v1, v3, v5 -; GCN-NEXT: v_fma_f32 v0, v0, v2, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_fma_f32 v3, v5, v4, v3 +; GCN-NEXT: v_fma_f32 v0, v0, v1, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v2bf16: @@ -51413,59 +47674,41 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GCN-LABEL: v_fma_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_fma_f32 v2, v2, v5, v8 -; GCN-NEXT: v_fma_f32 v1, v1, v4, v7 -; GCN-NEXT: v_fma_f32 v0, v0, v3, v6 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_fma_f32 v1, v1, v3, v5 +; GCN-NEXT: v_fma_f32 v3, v8, v7, v6 +; GCN-NEXT: v_fma_f32 v0, v0, v2, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v3bf16: @@ -51689,75 +47932,51 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GCN-LABEL: v_fma_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_fma_f32 v3, v3, v7, v11 -; GCN-NEXT: v_fma_f32 v2, v2, v6, v10 -; GCN-NEXT: v_fma_f32 v1, v1, v5, v9 -; GCN-NEXT: v_fma_f32 v0, v0, v4, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_fma_f32 v6, v8, v7, v6 +; GCN-NEXT: v_fma_f32 v1, v1, v3, v5 +; GCN-NEXT: v_fma_f32 v3, v11, v10, v9 +; GCN-NEXT: v_fma_f32 v0, v0, v2, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX7-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v4bf16: @@ -52031,139 +48250,91 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> ; GCN-LABEL: v_fma_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GCN-NEXT: v_fma_f32 v12, v14, v13, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_fma_f32 v3, v3, v7, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_fma_f32 v7, v7, v15, v23 -; GCN-NEXT: v_fma_f32 v6, v6, v14, v22 -; GCN-NEXT: v_fma_f32 v5, v5, v13, v21 -; GCN-NEXT: v_fma_f32 v4, v4, v12, v20 -; GCN-NEXT: v_fma_f32 v3, v3, v11, v19 -; GCN-NEXT: v_fma_f32 v2, v2, v10, v18 -; GCN-NEXT: v_fma_f32 v1, v1, v9, v17 -; GCN-NEXT: v_fma_f32 v0, v0, v8, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_fma_f32 v15, v17, v16, v15 +; GCN-NEXT: v_fma_f32 v2, v2, v6, v10 +; GCN-NEXT: v_fma_f32 v6, v13, v19, v18 +; GCN-NEXT: v_fma_f32 v1, v1, v5, v9 +; GCN-NEXT: v_fma_f32 v5, v11, v7, v14 +; GCN-NEXT: v_fma_f32 v0, v0, v4, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v15, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v12, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_fma_f32 v12, v14, v13, v12 +; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_fma_f32 v7, v13, v11, v7 +; GFX7-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX7-NEXT: v_fma_f32 v6, v11, v10, v6 +; GFX7-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_fma_f32 v5, v10, v9, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v12, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v8bf16: @@ -52607,388 +48778,224 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> ; GFX11FAKE16-NEXT: v_bfe_u32 v14, v2, 16, 1 ; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; GFX11FAKE16-NEXT: v_fmac_f32_e32 v8, v0, v4 -; GFX11FAKE16-NEXT: v_add3_u32 v0, v14, v2, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v18, v16 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_add3_u32 v0, v4, v9, 0x7fff -; GFX11FAKE16-NEXT: v_bfe_u32 v5, v15, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v15 -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_add3_u32 v2, v5, v15, 0x7fff -; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v8, 0x7fff -; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 -; GFX11FAKE16-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo -; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX1250-LABEL: v_fma_v8bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8 -; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9 -; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 -; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] - %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) - ret <8 x bfloat> %op -} - -define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { -; GCN-LABEL: v_fma_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_fma_f32 v15, v15, v31, v32 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_fma_f32 v14, v14, v30, v31 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_fma_f32 v13, v13, v29, v30 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_fma_f32 v12, v12, v28, v29 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_fma_f32 v11, v11, v27, v28 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_fma_f32 v10, v10, v26, v27 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_fma_f32 v9, v9, v25, v26 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_fma_f32 v8, v8, v24, v25 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v8, v0, v4 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v14, v2, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v18, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_add3_u32 v0, v4, v9, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v5, v15, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v15 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v2, v5, v15, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v8, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + ret <8 x bfloat> %op +} + +define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { +; GCN-LABEL: v_fma_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GCN-NEXT: v_fma_f32 v24, v26, v25, v24 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_fma_f32 v7, v7, v23, v24 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_fma_f32 v7, v7, v15, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; GCN-NEXT: v_fma_f32 v15, v25, v23, v15 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_fma_f32 v6, v6, v22, v23 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_fma_f32 v6, v6, v14, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; GCN-NEXT: v_fma_f32 v14, v23, v22, v14 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_fma_f32 v5, v5, v21, v22 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_fma_f32 v5, v5, v13, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_fma_f32 v4, v4, v20, v21 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_fma_f32 v13, v21, v13, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; GCN-NEXT: v_fma_f32 v4, v4, v12, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_fma_f32 v3, v3, v19, v20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v18 +; GCN-NEXT: v_fma_f32 v12, v12, v22, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v2 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_fma_f32 v2, v2, v18, v19 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_fma_f32 v3, v3, v11, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_fma_f32 v20, v22, v21, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_fma_f32 v2, v2, v10, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_fma_f32 v1, v1, v17, v18 -; GCN-NEXT: v_fma_f32 v0, v0, v16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_fma_f32 v11, v21, v19, v11 +; GCN-NEXT: v_fma_f32 v1, v1, v9, v17 +; GCN-NEXT: v_fma_f32 v9, v18, v10, v22 +; GCN-NEXT: v_fma_f32 v0, v0, v8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v24, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_fma_f32 v24, v26, v25, v24 +; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_fma_f32 v15, v25, v23, v15 +; GFX7-NEXT: v_fma_f32 v6, v6, v14, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_fma_f32 v14, v23, v22, v14 +; GFX7-NEXT: v_fma_f32 v5, v5, v13, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_fma_f32 v13, v22, v21, v13 +; GFX7-NEXT: v_fma_f32 v4, v4, v12, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_fma_f32 v12, v21, v20, v12 +; GFX7-NEXT: v_fma_f32 v3, v3, v11, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_fma_f32 v11, v20, v19, v11 +; GFX7-NEXT: v_fma_f32 v2, v2, v10, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_fma_f32 v10, v19, v18, v10 +; GFX7-NEXT: v_fma_f32 v1, v1, v9, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_fma_f32 v15, v15, v31, v32 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_fma_f32 v14, v14, v30, v31 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_fma_f32 v13, v13, v29, v30 -; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_fma_f32 v12, v12, v28, v29 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_fma_f32 v11, v11, v27, v28 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_fma_f32 v10, v10, v26, v27 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_fma_f32 v9, v9, v25, v26 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_fma_f32 v8, v8, v24, v25 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_fma_f32 v7, v7, v23, v24 -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_fma_f32 v6, v6, v22, v23 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_fma_f32 v5, v5, v21, v22 -; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_fma_f32 v4, v4, v20, v21 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20 -; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_fma_f32 v0, v0, v8, v16 +; GFX7-NEXT: v_fma_f32 v9, v18, v17, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v24, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v16bf16: @@ -53906,783 +49913,399 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GCN-LABEL: v_fma_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_fma_f32 v31, v31, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v30, v30, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v29, v29, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v28, v28, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v27, v27, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v26, v26, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v25, v25, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v24, v24, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v23, v23, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v22, v22, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v21, v21, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v20, v20, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v19, v19, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v18, v18, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GCN-NEXT: v_fma_f32 v31, v31, v35, v34 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v17, v17, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v16, v16, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_fma_f32 v32, v15, v33, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v15, v15, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v15, v33, v15, v35 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v14, v14, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_fma_f32 v30, v14, v30, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v14, v33, v14, v35 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v13, v13, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_fma_f32 v29, v13, v29, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v13, v33, v13, v35 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v12, v12, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_fma_f32 v28, v12, v28, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v12, v33, v12, v35 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v11, v11, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_fma_f32 v27, v11, v27, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v11, v33, v11, v35 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_fma_f32 v10, v10, v26, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v10, v10, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v26, v33, v26, v35 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v9, v9, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_fma_f32 v25, v9, v25, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v9, v33, v9, v35 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v8, v8, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_fma_f32 v24, v8, v24, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v8, v33, v8, v35 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v7, v7, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_fma_f32 v7, v7, v23, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v23, v33, v23, v35 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v6, v6, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v5, v5, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v4, v4, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v3, v3, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v2, v2, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v1, v1, v32, v33 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_fma_f32 v0, v0, v32, v33 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_fma_f32 v6, v6, v22, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v22, v33, v22, v35 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_fma_f32 v5, v5, v21, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v21, v33, v21, v35 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_fma_f32 v4, v4, v20, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v20, v33, v20, v35 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_fma_f32 v3, v3, v19, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v19, v33, v19, v35 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_fma_f32 v2, v2, v18, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v18, v33, v18, v35 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_fma_f32 v1, v1, v17, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; GCN-NEXT: v_fma_f32 v17, v33, v17, v35 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; GCN-NEXT: v_fma_f32 v0, v0, v16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v8, v16, v9, 16 +; GCN-NEXT: v_alignbit_b32 v9, v17, v26, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v11, v18, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v22, v13, 16 +; GCN-NEXT: v_alignbit_b32 v13, v21, v14, 16 +; GCN-NEXT: v_alignbit_b32 v14, v20, v15, 16 +; GCN-NEXT: v_alignbit_b32 v15, v19, v31, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v32 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_fma_f32 v31, v31, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v30, v30, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 +; GFX7-NEXT: v_fma_f32 v15, v15, v33, v32 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_fma_f32 v31, v31, v35, v34 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v29, v29, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v28, v28, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v27, v27, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v26, v26, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v25, v25, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v24, v24, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v23, v23, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v22, v22, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v21, v21, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v20, v20, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v19, v19, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v18, v18, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v17, v17, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v16, v16, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v15, v15, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v14, v14, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_alignbit_b32 v15, v15, v31, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v13, v13, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 +; GFX7-NEXT: v_fma_f32 v14, v14, v30, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_fma_f32 v32, v34, v32, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v12, v12, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 +; GFX7-NEXT: v_fma_f32 v13, v13, v29, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_fma_f32 v30, v34, v30, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_alignbit_b32 v13, v13, v30, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v11, v11, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; GFX7-NEXT: v_fma_f32 v12, v12, v28, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_fma_f32 v29, v34, v29, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v29, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v10, v10, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168 +; GFX7-NEXT: v_fma_f32 v11, v11, v27, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_fma_f32 v28, v34, v28, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_alignbit_b32 v11, v11, v28, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v9, v9, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 +; GFX7-NEXT: v_fma_f32 v10, v10, v26, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_fma_f32 v27, v34, v27, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v27, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v8, v8, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 +; GFX7-NEXT: v_fma_f32 v9, v9, v25, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_fma_f32 v26, v35, v34, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_alignbit_b32 v9, v9, v26, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v7, v7, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 +; GFX7-NEXT: v_fma_f32 v8, v8, v24, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_fma_f32 v25, v35, v34, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v25, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v6, v6, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152 +; GFX7-NEXT: v_fma_f32 v7, v7, v23, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_fma_f32 v24, v35, v34, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v24, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v5, v5, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; GFX7-NEXT: v_fma_f32 v6, v6, v22, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_fma_f32 v23, v35, v34, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v4, v4, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; GFX7-NEXT: v_fma_f32 v5, v5, v21, v33 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_fma_f32 v22, v35, v34, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v22, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v33 ; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v3, v3, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; GFX7-NEXT: v_fma_f32 v4, v4, v20, v33 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_fma_f32 v21, v35, v34, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v21, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v2, v2, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; GFX7-NEXT: v_lshlrev_b32_e32 v33, 16, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_fma_f32 v33, v35, v34, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v33, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v1, v1, v32, v33 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_fma_f32 v20, v35, v34, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v20, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_fma_f32 v0, v0, v32, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_fma_f32 v19, v35, v34, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v19, 16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17 +; GFX7-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v18, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_v32bf16: @@ -56851,51 +52474,39 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GCN-LABEL: v_fmuladd_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_mul_f32_e32 v3, v4, v3 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_add_f32_e32 v1, v1, v5 -; GCN-NEXT: v_add_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmuladd_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmuladd_v2bf16: @@ -57057,71 +52668,53 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GCN-LABEL: v_fmuladd_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v3, v7, v6 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v1, v1, v5 ; GCN-NEXT: v_add_f32_e32 v2, v2, v8 -; GCN-NEXT: v_add_f32_e32 v1, v1, v7 -; GCN-NEXT: v_add_f32_e32 v0, v0, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_add_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmuladd_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmuladd_v3bf16: @@ -57345,91 +52938,67 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GCN-LABEL: v_fmuladd_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v4 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v3, v3, v11 -; GCN-NEXT: v_add_f32_e32 v2, v2, v10 -; GCN-NEXT: v_add_f32_e32 v1, v1, v9 -; GCN-NEXT: v_add_f32_e32 v0, v0, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v6, v7, v6 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v3, v10, v9 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v2, v2, v8 +; GCN-NEXT: v_add_f32_e32 v1, v1, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v11 +; GCN-NEXT: v_add_f32_e32 v0, v0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmuladd_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmuladd_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 7c4854408d716..286f89220ec3b 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -604,15 +604,11 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { ; SI-LABEL: v_bswap_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -635,20 +631,15 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { ; SI-LABEL: v_bswap_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v3i16: @@ -673,25 +664,15 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) { ; SI-LABEL: v_bswap_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8 -; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v4 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v6 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 70e7ec2ea8b67..3546141afe5bb 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -7030,93 +7030,107 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -7253,18 +7267,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7298,19 +7315,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7677,6 +7697,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7692,33 +7715,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v9 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7730,30 +7753,35 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7771,33 +7799,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7809,25 +7836,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7997,93 +8025,107 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -8250,18 +8292,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8295,19 +8340,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8507,93 +8555,107 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -8760,18 +8822,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8805,19 +8870,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -9198,85 +9266,97 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -9611,8 +9691,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -9652,8 +9734,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -10277,6 +10361,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -10292,30 +10378,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 ; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -10327,28 +10413,32 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB28_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB28_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10417,8 +10507,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_cbranch_execnz .LBB28_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -10765,85 +10857,97 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -11178,8 +11282,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -11219,8 +11325,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11604,85 +11712,97 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -12017,8 +12137,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -12058,8 +12180,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -12430,8 +12554,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -12471,8 +12597,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 064b36cc261b3..7896edd5016f0 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -6250,93 +6250,107 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -6552,18 +6566,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6597,19 +6614,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7085,6 +7105,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7100,33 +7123,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v9 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7138,30 +7161,35 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7179,33 +7207,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7217,25 +7244,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7678,85 +7706,97 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -8175,8 +8215,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -8216,8 +8258,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -8985,6 +9029,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -8995,7 +9041,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: @@ -9003,27 +9049,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v9 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v10 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9035,28 +9081,32 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -9068,7 +9118,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 @@ -9077,24 +9127,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v9 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v7, 16 +; GFX6-NEXT: v_alignbit_b32 v4, v6, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -9116,17 +9166,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index c8e7540124fd9..2ade237eaa6da 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -6250,93 +6250,107 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -6552,18 +6566,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6597,19 +6614,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7085,6 +7105,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7100,33 +7123,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v9 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7138,30 +7161,35 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7179,33 +7207,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -7217,25 +7244,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7678,85 +7706,97 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -8175,8 +8215,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -8216,8 +8258,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -8985,6 +9029,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -8995,7 +9041,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: @@ -9003,27 +9049,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v9 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v10 ; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9035,28 +9081,32 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -9068,7 +9118,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 @@ -9077,24 +9127,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v9 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v7, 16 +; GFX6-NEXT: v_alignbit_b32 v4, v6, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -9116,17 +9166,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index c407f7645315d..f67e5b86497ba 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -2933,8 +2933,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3046,7 +3044,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -3056,9 +3054,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 -; CI-NEXT: v_mov_b32_e32 v0, v3 -; CI-NEXT: v_mov_b32_e32 v2, v4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3171,7 +3167,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -3181,10 +3177,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3301,9 +3294,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 1 -; CI-NEXT: v_mov_b32_e32 v1, 2 -; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v0, 0x20001 +; CI-NEXT: v_mov_b32_e32 v1, 3 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm @@ -3414,9 +3406,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 1.0 -; CI-NEXT: v_mov_b32_e32 v1, 2.0 -; CI-NEXT: v_mov_b32_e32 v2, 4.0 +; CI-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; CI-NEXT: v_mov_b32_e32 v1, 0x4400 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm @@ -3533,11 +3524,6 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_mov_b32_e32 v1, v4 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3654,10 +3640,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 1 -; CI-NEXT: v_mov_b32_e32 v1, 2 -; CI-NEXT: v_mov_b32_e32 v2, 3 -; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: v_mov_b32_e32 v0, 0x20001 +; CI-NEXT: v_mov_b32_e32 v1, 0x40003 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm @@ -3765,7 +3749,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -3774,10 +3758,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ef5438e63f667..30dc25388767d 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -626,12 +626,16 @@ define amdgpu_hs half @hs_mesa(half %arg0) { define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { ; SI-LABEL: ps_mesa_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_v2f16: @@ -660,12 +664,16 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { ; SI-LABEL: ps_mesa_inreg_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v2f16: @@ -696,11 +704,10 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { ; SI-LABEL: ps_mesa_v2i16: ; SI: ; %bb.0: -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_mov_b32 s0, 0xffff0000 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x10000, v0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -735,7 +742,7 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { ; SI-LABEL: ps_mesa_inreg_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_and_b32 s1, s0, 0xffff0000 ; SI-NEXT: s_add_i32 s0, s0, 1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_and_b32 s0, s0, 0xffff @@ -779,18 +786,26 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; SI-LABEL: ps_mesa_v4f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_v4f16: @@ -824,18 +839,26 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; SI-LABEL: ps_mesa_inreg_v4f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_lshr_b32 s2, s0, 16 +; SI-NEXT: s_lshr_b32 s3, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s3 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index a84872d8eac0f..b5e0d3aeace32 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -1607,8 +1607,12 @@ define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 31ff0572bfd29..40efd06c2bdfd 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1052,10 +1052,11 @@ define amdgpu_vs <2 x half> @load_v2i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s1, s2, 16 ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: s_add_i32 s3, s3, s1 ; GFX67-NEXT: s_add_i32 s0, s0, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX67-NEXT: s_add_i32 s3, s3, s1 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s1, s3, 16 +; GFX67-NEXT: s_or_b32 s0, s0, s1 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v2i16: @@ -1106,12 +1107,13 @@ define amdgpu_vs <3 x half> @load_v3i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s2, 16 ; GFX67-NEXT: s_lshr_b32 s5, s0, 16 +; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s5, s5, s4 ; GFX67-NEXT: s_add_i32 s1, s1, s3 -; GFX67-NEXT: s_add_i32 s0, s0, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s2, s5, 16 +; GFX67-NEXT: s_or_b32 s0, s0, s2 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3i16: @@ -1165,17 +1167,19 @@ define amdgpu_vs <4 x half> @load_v4i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: s_lshr_b32 s5, s1, 16 ; GFX67-NEXT: s_lshr_b32 s6, s2, 16 +; GFX67-NEXT: s_lshr_b32 s5, s1, 16 ; GFX67-NEXT: s_lshr_b32 s7, s3, 16 -; GFX67-NEXT: s_add_i32 s5, s5, s7 +; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s4, s4, s6 ; GFX67-NEXT: s_add_i32 s1, s1, s3 -; GFX67-NEXT: s_add_i32 s0, s0, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s5 +; GFX67-NEXT: s_add_i32 s5, s5, s7 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s2, s4, 16 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s2 +; GFX67-NEXT: s_lshl_b32 s2, s5, 16 +; GFX67-NEXT: s_or_b32 s1, s1, s2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4i16: @@ -1235,23 +1239,26 @@ define amdgpu_vs <6 x half> @load_v6i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: s_lshr_b32 s7, s1, 16 -; GFX67-NEXT: s_lshr_b32 s8, s2, 16 ; GFX67-NEXT: s_lshr_b32 s9, s4, 16 +; GFX67-NEXT: s_lshr_b32 s7, s1, 16 ; GFX67-NEXT: s_lshr_b32 s10, s5, 16 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s3, s3, s9 +; GFX67-NEXT: s_lshr_b32 s8, s2, 16 ; GFX67-NEXT: s_lshr_b32 s11, s6, 16 -; GFX67-NEXT: s_add_i32 s8, s8, s11 +; GFX67-NEXT: s_add_i32 s1, s1, s5 ; GFX67-NEXT: s_add_i32 s7, s7, s10 -; GFX67-NEXT: s_add_i32 s3, s3, s9 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s3, s3, 16 ; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s1, s1, s5 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s8 +; GFX67-NEXT: s_add_i32 s8, s8, s11 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s3 +; GFX67-NEXT: s_lshl_b32 s3, s7, 16 +; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s3 +; GFX67-NEXT: s_lshl_b32 s3, s8, 16 +; GFX67-NEXT: s_or_b32 s2, s2, s3 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6i16: @@ -1321,29 +1328,33 @@ define amdgpu_vs <8 x half> @load_v8i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s8, s0, 16 -; GFX67-NEXT: s_lshr_b32 s9, s1, 16 -; GFX67-NEXT: s_lshr_b32 s10, s2, 16 -; GFX67-NEXT: s_lshr_b32 s11, s3, 16 ; GFX67-NEXT: s_lshr_b32 s12, s4, 16 +; GFX67-NEXT: s_lshr_b32 s9, s1, 16 ; GFX67-NEXT: s_lshr_b32 s13, s5, 16 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s8, s8, s12 +; GFX67-NEXT: s_lshr_b32 s10, s2, 16 ; GFX67-NEXT: s_lshr_b32 s14, s6, 16 +; GFX67-NEXT: s_add_i32 s1, s1, s5 +; GFX67-NEXT: s_add_i32 s9, s9, s13 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s4, s8, 16 +; GFX67-NEXT: s_lshr_b32 s11, s3, 16 ; GFX67-NEXT: s_lshr_b32 s15, s7, 16 -; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_add_i32 s2, s2, s6 ; GFX67-NEXT: s_add_i32 s10, s10, s14 -; GFX67-NEXT: s_add_i32 s9, s9, s13 -; GFX67-NEXT: s_add_i32 s8, s8, s12 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s4 +; GFX67-NEXT: s_lshl_b32 s4, s9, 16 ; GFX67-NEXT: s_add_i32 s3, s3, s7 -; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s1, s1, s5 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s9 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s10 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s11 +; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s4 +; GFX67-NEXT: s_lshl_b32 s4, s10, 16 +; GFX67-NEXT: s_and_b32 s3, s3, 0xffff +; GFX67-NEXT: s_or_b32 s2, s2, s4 +; GFX67-NEXT: s_lshl_b32 s4, s11, 16 +; GFX67-NEXT: s_or_b32 s3, s3, s4 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8i16: @@ -1423,53 +1434,61 @@ define amdgpu_vs <16 x half> @load_v16i16(ptr addrspace(6) inreg %p0, ptr addrsp ; GFX67-NEXT: s_load_dwordx8 s[8:15], s[8:9], 0x10 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s16, s0, 16 -; GFX67-NEXT: s_lshr_b32 s17, s1, 16 -; GFX67-NEXT: s_lshr_b32 s18, s2, 16 -; GFX67-NEXT: s_lshr_b32 s19, s3, 16 -; GFX67-NEXT: s_lshr_b32 s20, s4, 16 -; GFX67-NEXT: s_lshr_b32 s21, s5, 16 -; GFX67-NEXT: s_lshr_b32 s22, s6, 16 -; GFX67-NEXT: s_lshr_b32 s23, s7, 16 ; GFX67-NEXT: s_lshr_b32 s24, s8, 16 +; GFX67-NEXT: s_lshr_b32 s17, s1, 16 ; GFX67-NEXT: s_lshr_b32 s25, s9, 16 +; GFX67-NEXT: s_add_i32 s0, s0, s8 +; GFX67-NEXT: s_add_i32 s16, s16, s24 +; GFX67-NEXT: s_lshr_b32 s18, s2, 16 ; GFX67-NEXT: s_lshr_b32 s26, s10, 16 +; GFX67-NEXT: s_add_i32 s1, s1, s9 +; GFX67-NEXT: s_add_i32 s17, s17, s25 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff +; GFX67-NEXT: s_lshl_b32 s8, s16, 16 +; GFX67-NEXT: s_lshr_b32 s19, s3, 16 ; GFX67-NEXT: s_lshr_b32 s27, s11, 16 +; GFX67-NEXT: s_add_i32 s2, s2, s10 +; GFX67-NEXT: s_add_i32 s18, s18, s26 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s8 +; GFX67-NEXT: s_lshl_b32 s8, s17, 16 +; GFX67-NEXT: s_lshr_b32 s20, s4, 16 ; GFX67-NEXT: s_lshr_b32 s28, s12, 16 +; GFX67-NEXT: s_add_i32 s3, s3, s11 +; GFX67-NEXT: s_add_i32 s19, s19, s27 +; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s8 +; GFX67-NEXT: s_lshl_b32 s8, s18, 16 +; GFX67-NEXT: s_lshr_b32 s21, s5, 16 ; GFX67-NEXT: s_lshr_b32 s29, s13, 16 +; GFX67-NEXT: s_add_i32 s4, s4, s12 +; GFX67-NEXT: s_add_i32 s20, s20, s28 +; GFX67-NEXT: s_and_b32 s3, s3, 0xffff +; GFX67-NEXT: s_or_b32 s2, s2, s8 +; GFX67-NEXT: s_lshl_b32 s8, s19, 16 +; GFX67-NEXT: s_lshr_b32 s22, s6, 16 ; GFX67-NEXT: s_lshr_b32 s30, s14, 16 +; GFX67-NEXT: s_add_i32 s5, s5, s13 +; GFX67-NEXT: s_add_i32 s21, s21, s29 +; GFX67-NEXT: s_and_b32 s4, s4, 0xffff +; GFX67-NEXT: s_or_b32 s3, s3, s8 +; GFX67-NEXT: s_lshl_b32 s8, s20, 16 +; GFX67-NEXT: s_lshr_b32 s23, s7, 16 ; GFX67-NEXT: s_lshr_b32 s31, s15, 16 -; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_add_i32 s6, s6, s14 ; GFX67-NEXT: s_add_i32 s22, s22, s30 -; GFX67-NEXT: s_add_i32 s21, s21, s29 -; GFX67-NEXT: s_add_i32 s20, s20, s28 -; GFX67-NEXT: s_add_i32 s19, s19, s27 -; GFX67-NEXT: s_add_i32 s18, s18, s26 -; GFX67-NEXT: s_add_i32 s17, s17, s25 -; GFX67-NEXT: s_add_i32 s16, s16, s24 +; GFX67-NEXT: s_and_b32 s5, s5, 0xffff +; GFX67-NEXT: s_or_b32 s4, s4, s8 +; GFX67-NEXT: s_lshl_b32 s8, s21, 16 ; GFX67-NEXT: s_add_i32 s7, s7, s15 -; GFX67-NEXT: s_add_i32 s6, s6, s14 -; GFX67-NEXT: s_add_i32 s5, s5, s13 -; GFX67-NEXT: s_add_i32 s4, s4, s12 -; GFX67-NEXT: s_add_i32 s3, s3, s11 -; GFX67-NEXT: s_add_i32 s2, s2, s10 -; GFX67-NEXT: s_add_i32 s1, s1, s9 -; GFX67-NEXT: s_add_i32 s0, s0, s8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s20 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s21 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s22 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s23 +; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_and_b32 s6, s6, 0xffff +; GFX67-NEXT: s_or_b32 s5, s5, s8 +; GFX67-NEXT: s_lshl_b32 s8, s22, 16 +; GFX67-NEXT: s_and_b32 s7, s7, 0xffff +; GFX67-NEXT: s_or_b32 s6, s6, s8 +; GFX67-NEXT: s_lshl_b32 s8, s23, 16 +; GFX67-NEXT: s_or_b32 s7, s7, s8 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16i16: @@ -1773,20 +1792,25 @@ define amdgpu_vs <2 x float> @sextload_v2i16(ptr addrspace(6) inreg %p0, ptr add define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v2f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s1, s3 +; GFX67-NEXT: s_load_dword s2, s[2:3], 0x2 ; GFX67-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX67-NEXT: s_load_dword s1, s[2:3], 0x2 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s2, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: s_lshr_b32 s1, s2, 16 +; GFX67-NEXT: s_lshr_b32 s3, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v2f16: @@ -1837,14 +1861,21 @@ define amdgpu_vs <3 x half> @load_v3f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshr_b32 s4, s0, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX67-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX67-NEXT: v_add_f32_e32 v2, v1, v4 -; GFX67-NEXT: v_add_f32_e32 v1, v3, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_add_f32_e32 v2, v4, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 +; GFX67-NEXT: v_readfirstlane_b32 s1, v2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3f16: @@ -1896,22 +1927,32 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s4, s1, 16 -; GFX67-NEXT: s_lshr_b32 s5, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: s_lshr_b32 s4, s0, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX67-NEXT: s_lshr_b32 s0, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX67-NEXT: s_lshr_b32 s0, s2, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2 ; GFX67-NEXT: s_lshr_b32 s0, s3, 16 -; GFX67-NEXT: s_lshr_b32 s1, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s1 ; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_readfirstlane_b32 s1, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4f16: @@ -1967,33 +2008,47 @@ define amdgpu_vs <6 x half> @load_v6f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s4, s1 ; GFX67-NEXT: s_mov_b32 s1, s5 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s6, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 -; GFX67-NEXT: s_lshr_b32 s3, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 +; GFX67-NEXT: s_lshr_b32 s0, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX67-NEXT: s_lshr_b32 s0, s2, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 ; GFX67-NEXT: s_lshr_b32 s0, s4, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s4 +; GFX67-NEXT: s_lshr_b32 s0, s5, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v9 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v11 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: s_lshr_b32 s1, s6, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s1 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_readfirstlane_b32 s1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_readfirstlane_b32 s2, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6f16: @@ -2060,40 +2115,60 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s1, s5 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s6, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s6 -; GFX67-NEXT: s_lshr_b32 s6, s1, 16 -; GFX67-NEXT: s_lshr_b32 s7, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX67-NEXT: s_lshr_b32 s6, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s7 ; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s6 ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: s_lshr_b32 s0, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX67-NEXT: s_lshr_b32 s0, s2, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX67-NEXT: s_lshr_b32 s0, s3, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s7, 16 +; GFX67-NEXT: s_lshr_b32 s0, s4, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4 ; GFX67-NEXT: s_lshr_b32 s0, s5, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s5 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v8 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX67-NEXT: s_lshr_b32 s0, s6, 16 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s0 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 +; GFX67-NEXT: s_lshr_b32 s1, s7, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s7 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v12 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v13 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v14 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v15 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v11 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v10 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v8 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s1 +; GFX67-NEXT: v_add_f32_e32 v4, v4, v15 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s7 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v14 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_add_f32_e32 v6, v6, v13 +; GFX67-NEXT: v_readfirstlane_b32 s1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX67-NEXT: v_add_f32_e32 v7, v7, v12 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v7 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_readfirstlane_b32 s2, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_readfirstlane_b32 s3, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8f16: @@ -2165,77 +2240,117 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <16 x half> @load_v16f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v16f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s9, 0 -; GFX67-NEXT: s_mov_b32 s8, s1 -; GFX67-NEXT: s_mov_b32 s1, s9 -; GFX67-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 +; GFX67-NEXT: s_mov_b32 s3, 0 +; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s1, s3 +; GFX67-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s10, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s10 -; GFX67-NEXT: s_lshr_b32 s10, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s10 -; GFX67-NEXT: s_lshr_b32 s10, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s10 -; GFX67-NEXT: s_lshr_b32 s10, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s10 -; GFX67-NEXT: s_lshr_b32 s10, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s10 -; GFX67-NEXT: s_lshr_b32 s10, s1, 16 -; GFX67-NEXT: s_lshr_b32 s11, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s10 -; GFX67-NEXT: s_lshr_b32 s10, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s11 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s10 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s4 -; GFX67-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x10 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: s_lshr_b32 s0, s4, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX67-NEXT: s_lshr_b32 s0, s6, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX67-NEXT: s_lshr_b32 s0, s7, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: s_lshr_b32 s0, s8, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 +; GFX67-NEXT: s_lshr_b32 s0, s9, 16 +; GFX67-NEXT: s_lshr_b32 s1, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 +; GFX67-NEXT: s_lshr_b32 s0, s10, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s0 +; GFX67-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x10 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s8 +; GFX67-NEXT: s_lshr_b32 s8, s11, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s8 +; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s6 -; GFX67-NEXT: s_lshr_b32 s4, s4, 16 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v17 +; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0 +; GFX67-NEXT: s_lshr_b32 s0, s7, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s8 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s9 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s10 -; GFX67-NEXT: v_add_f32_e32 v8, v8, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s11 +; GFX67-NEXT: s_lshr_b32 s0, s6, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10 +; GFX67-NEXT: v_add_f32_e32 v14, v14, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s6 +; GFX67-NEXT: s_lshr_b32 s0, s5, 16 +; GFX67-NEXT: v_add_f32_e32 v12, v12, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s9 +; GFX67-NEXT: v_add_f32_e32 v13, v13, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s5 +; GFX67-NEXT: s_lshr_b32 s0, s4, 16 ; GFX67-NEXT: v_add_f32_e32 v10, v10, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s4 -; GFX67-NEXT: s_lshr_b32 s3, s7, 16 -; GFX67-NEXT: s_lshr_b32 s12, s6, 16 -; GFX67-NEXT: s_lshr_b32 s13, s5, 16 -; GFX67-NEXT: v_add_f32_e32 v12, v12, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s13 -; GFX67-NEXT: v_add_f32_e32 v14, v14, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s12 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s3 -; GFX67-NEXT: s_lshr_b32 s1, s10, 16 -; GFX67-NEXT: s_lshr_b32 s0, s11, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s1 -; GFX67-NEXT: s_lshr_b32 s1, s9, 16 -; GFX67-NEXT: s_lshr_b32 s2, s8, 16 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s2 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s1 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: v_add_f32_e32 v11, v11, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4 +; GFX67-NEXT: s_lshr_b32 s0, s3, 16 +; GFX67-NEXT: v_add_f32_e32 v8, v8, v19 ; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 ; GFX67-NEXT: v_add_f32_e32 v9, v9, v17 -; GFX67-NEXT: v_add_f32_e32 v11, v11, v18 -; GFX67-NEXT: v_add_f32_e32 v13, v13, v16 -; GFX67-NEXT: v_add_f32_e32 v15, v15, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s3 +; GFX67-NEXT: s_lshr_b32 s0, s2, 16 +; GFX67-NEXT: v_add_f32_e32 v6, v6, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s8 +; GFX67-NEXT: v_add_f32_e32 v7, v7, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s2 +; GFX67-NEXT: s_lshr_b32 s0, s1, 16 +; GFX67-NEXT: v_add_f32_e32 v4, v4, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v16 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v18 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v19 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GFX67-NEXT: v_readfirstlane_b32 s1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v7 +; GFX67-NEXT: v_readfirstlane_b32 s2, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v8 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v9 +; GFX67-NEXT: v_readfirstlane_b32 s3, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v10 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v11 +; GFX67-NEXT: v_readfirstlane_b32 s4, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v12 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_readfirstlane_b32 s5, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v13 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v14 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v15 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-NEXT: v_readfirstlane_b32 s6, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_readfirstlane_b32 s7, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16f16: @@ -2361,16 +2476,17 @@ define amdgpu_vs <2 x bfloat> @load_v2bf16(ptr addrspace(6) inreg %p0, ptr addrs ; GFX67-NEXT: s_load_dword s2, s[2:3], 0x2 ; GFX67-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s1, s2, 0xffff0000 -; GFX67-NEXT: s_and_b32 s3, s0, 0xffff0000 -; GFX67-NEXT: v_mov_b32_e32 v0, s1 ; GFX67-NEXT: s_lshl_b32 s1, s2, 16 -; GFX67-NEXT: v_add_f32_e32 v1, s3, v0 -; GFX67-NEXT: s_lshl_b32 s0, s0, 16 ; GFX67-NEXT: v_mov_b32_e32 v0, s1 -; GFX67-NEXT: v_add_f32_e32 v0, s0, v0 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX67-NEXT: s_and_b32 s1, s2, 0xffff0000 +; GFX67-NEXT: s_lshl_b32 s3, s0, 16 +; GFX67-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 +; GFX67-NEXT: v_add_f32_e32 v1, s0, v1 +; GFX67-NEXT: v_add_f32_e32 v0, s3, v0 +; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; GFX67-NEXT: v_readfirstlane_b32 s0, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v2bf16: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index cb05b5978c384..26e1b66161a2a 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -2359,12 +2359,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i16: @@ -2402,15 +2402,13 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_ffbh_u32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_ffbh_u32_e32 v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3 -; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v3i16: @@ -2451,20 +2449,19 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; SI-LABEL: v_ctlz_zero_undef_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 63, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index d1b8a17915adc..777b703d5319d 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1757,31 +1757,37 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test3: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_v2f16_test3: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_v2f16_test3: @@ -1913,31 +1919,37 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test4: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_v2f16_test4: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_v2f16_test4: @@ -3031,17 +3043,15 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2bf16_test3: @@ -3236,17 +3246,15 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2bf16_test4: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 6d3ea5f492373..78a00dd51c2b2 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -32,20 +32,20 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v5, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -59,33 +59,32 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: .LBB0_3: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_mov_b32_e32 v3, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, 0x8000 -; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v6, 1 -; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; SI-NEXT: v_mov_b32_e32 v4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, 0x8000 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v7, 1 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB0_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB0_2 ; @@ -215,18 +214,18 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %T @@ -241,44 +240,42 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB1_3: ; %exit ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v7, 1 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB1_2 @@ -417,13 +414,13 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB2_3 ; SI-NEXT: .LBB2_2: ; %T @@ -432,11 +429,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -446,13 +443,13 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB2_3: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -468,12 +465,18 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB2_4: -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB2_2 ; @@ -636,20 +639,20 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v5, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB3_3 ; SI-NEXT: .LBB3_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -679,33 +682,32 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: .LBB3_3: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_mov_b32_e32 v3, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, 0x8000 -; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v6, 1 -; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; SI-NEXT: v_mov_b32_e32 v4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, 0x8000 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v7, 1 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB3_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB3_2 ; @@ -875,9 +877,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v7, v2 +; SI-NEXT: v_or_b32_e32 v2, v6, v8 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB4_3 ; SI-NEXT: .LBB4_2: ; %T @@ -892,11 +894,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -918,34 +920,32 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 ; SI-NEXT: .LBB4_3: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v7, 1 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB4_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB4_2 @@ -1115,13 +1115,13 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %T @@ -1130,11 +1130,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1160,13 +1160,13 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB5_3: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -1182,12 +1182,18 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB5_2 ; @@ -1326,21 +1332,14 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v0, v0 -; SI-NEXT: ds_read_b32 v2, v1 -; SI-NEXT: ds_read_b32 v4, v3 -; SI-NEXT: ds_read_b32 v6, v5 -; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: ds_read_b32 v1, v1 +; SI-NEXT: ds_read_b32 v2, v2 +; SI-NEXT: ds_read_b32 v3, v3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: large_vector: @@ -1419,19 +1418,19 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1452,43 +1451,43 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v11, v2 -; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: v_or_b32_e32 v8, v8, v3 +; SI-NEXT: v_or_b32_e32 v3, v11, v12 ; SI-NEXT: v_or_b32_e32 v2, v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB7_3 ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB7_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1510,50 +1509,46 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v9, v9, v0 ; SI-NEXT: v_or_b32_e32 v8, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 ; SI-NEXT: .LBB7_4: ; %exit ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_movk_i32 s34, 0x3800 ; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 ; SI-NEXT: v_mov_b32_e32 v9, 0x3900 ; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000 ; SI-NEXT: v_mov_b32_e32 v11, 0x39000000 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 -; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc -; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v13, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v5, v12 -; SI-NEXT: v_or_b32_e32 v6, v3, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc +; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6 +; SI-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v2, v7, v4 +; SI-NEXT: v_or_b32_e32 v3, v12, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_8xi16_0: @@ -1744,21 +1739,21 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1776,122 +1771,138 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v9, v10, v12 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_or_b32_e32 v10, v7, v14 -; SI-NEXT: v_or_b32_e32 v11, v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v11, v8, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB8_3 ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB8_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v8, v7, v10 -; SI-NEXT: v_or_b32_e32 v9, v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: .LBB8_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v11 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v12 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v2 +; SI-NEXT: v_or_b32_e32 v2, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v8, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xf16_extract_8xf16_0: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 1c687734731b1..87d7a73c5c01f 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -68,14 +68,13 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff ; GCN-NEXT: v_mov_b32_e32 v3, 0x8000 -; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000 +; GCN-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; GCN-NEXT: v_bfrev_b32_e32 v5, 1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] br i1 %c0, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 2335da7f0abde..65e2b26a79fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -215,10 +215,14 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 ; CI-LABEL: v_test_canonicalize_build_vector_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16: @@ -2441,12 +2445,15 @@ define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v3f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v3f16: @@ -2481,14 +2488,20 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v4f16: @@ -2560,8 +2573,11 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16: @@ -2600,8 +2616,11 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16: @@ -2638,8 +2657,7 @@ define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 { ; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_mov_b32_e32 v1, 1.0 +; CI-NEXT: v_bfrev_b32_e32 v0, 60 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: @@ -2668,8 +2686,7 @@ define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 { ; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, 1.0 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: @@ -2698,8 +2715,7 @@ define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 { ; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_mov_b32_e32 v1, 0x41800000 +; CI-NEXT: v_bfrev_b32_e32 v0, 50 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: @@ -2728,8 +2744,7 @@ define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 { ; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, 0x41800000 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0x4c00 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: @@ -2761,8 +2776,11 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16: @@ -2804,8 +2822,11 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_mov_b32_e32 v0, 2.0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16: @@ -2895,10 +2916,12 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: @@ -2944,12 +2967,17 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: @@ -2999,13 +3027,20 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v3 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: @@ -3060,18 +3095,27 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v6f16: @@ -3115,22 +3159,34 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; CI-NEXT: v_or_b32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3183,30 +3239,48 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v0, v0, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v2, v2, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_or_b32_e32 v3, v3, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_or_b32_e32 v5, v5, v6 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v12f16: @@ -3269,38 +3343,62 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v0, v0, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v2, v2, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_or_b32_e32 v3, v3, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v4, v4, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_or_b32_e32 v5, v5, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v6, v6, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_or_b32_e32 v7, v7, v8 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3397,72 +3495,118 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v32f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v16 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v1, v1, v17 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_or_b32_e32 v2, v2, v18 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_or_b32_e32 v3, v3, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_or_b32_e32 v4, v4, v16 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v5, v5, v17 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_or_b32_e32 v6, v6, v18 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_or_b32_e32 v7, v7, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_or_b32_e32 v8, v8, v16 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v9, v9, v17 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_or_b32_e32 v10, v10, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_or_b32_e32 v11, v11, v18 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_or_b32_e32 v12, v12, v16 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; CI-NEXT: v_or_b32_e32 v13, v13, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_or_b32_e32 v14, v14, v16 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; CI-NEXT: v_or_b32_e32 v15, v15, v16 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v32f16: @@ -3635,393 +3779,232 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v64f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v0, v0, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v1, v1, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v2, v3, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v2, v2, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v3, v4, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v3, v3, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v4, v5, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v4, v4, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_or_b32_e32 v5, v6, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v5, v5, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v6, v7, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v19 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; CI-NEXT: v_or_b32_e32 v6, v6, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_or_b32_e32 v7, v8, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v18 -; CI-NEXT: v_or_b32_e32 v8, v10, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v7, v7, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v8, v8, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v9, v9, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_or_b32_e32 v9, v10, v9 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; CI-NEXT: v_or_b32_e32 v10, v14, v10 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_or_b32_e32 v13, v16, v13 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; CI-NEXT: v_or_b32_e32 v19, v20, v19 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 -; CI-NEXT: v_or_b32_e32 v20, v22, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v10, v10, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v11, v11, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; CI-NEXT: v_or_b32_e32 v21, v22, v21 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v12, v12, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_or_b32_e32 v13, v13, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v14, v14, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v15, v15, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v16, v16, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v17, v17, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v18, v18, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v19, v19, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x64, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v20, v20, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v21, v21, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v22, v22, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_or_b32_e32 v23, v23, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_or_b32_e32 v24, v24, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v25, v25, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_or_b32_e32 v26, v26, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v27, v27, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_or_b32_e32 v28, v28, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v29, v29, v31 +; CI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; CI-NEXT: v_or_b32_e32 v31, v32, v31 -; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 -; CI-NEXT: buffer_store_dword v21, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 -; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; CI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; CI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; CI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; CI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; CI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; CI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CI-NEXT: v_or_b32_e32 v30, v30, v31 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; CI-NEXT: v_or_b32_e32 v31, v31, v32 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v64f16: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index cdec7545ac411..50066711f2552 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1061,40 +1061,28 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_and_b32 s1, s1, 0x8000 +; GCN-NEXT: s_and_b32 s3, s0, 0x7fff +; GCN-NEXT: s_or_b32 s1, s3, s1 +; GCN-NEXT: s_and_b32 s2, s2, 0x8000 +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_lshr_b32 s2, s1, 16 +; GFX7-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX7-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_or_b32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v2bf16: @@ -1137,58 +1125,34 @@ define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v1, v5, v2 -; GCN-NEXT: v_or_b32_e32 v2, v4, v6 -; GCN-NEXT: v_or_b32_e32 v3, v3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v3, v0 -; GCN-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 -; GCN-NEXT: v_readfirstlane_b32 s0, v4 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_and_b32 s4, s2, 0x8000 +; GCN-NEXT: s_and_b32 s5, s0, 0x7fff +; GCN-NEXT: s_and_b32 s3, s3, 0x8000 +; GCN-NEXT: s_and_b32 s1, s1, 0x7fff +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_or_b32 s1, s1, s3 +; GCN-NEXT: s_and_b32 s2, s2, 0x8000 +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s0, s4, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s2 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v2, v0 -; GFX7-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 -; GFX7-NEXT: v_readfirstlane_b32 s0, v4 -; GFX7-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7-NEXT: s_and_b32 s4, s2, 0x8000 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: s_and_b32 s5, s0, 0x7fff +; GFX7-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX7-NEXT: s_and_b32 s1, s1, 0x7fff +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s3 +; GFX7-NEXT: s_or_b32 s0, s4, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v3bf16: @@ -1245,70 +1209,46 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s6 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v2, v6, v2 -; GCN-NEXT: v_or_b32_e32 v1, v5, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_lshr_b32 s4, s2, 16 +; GCN-NEXT: s_lshr_b32 s5, s3, 16 +; GCN-NEXT: s_and_b32 s3, s3, 0x8000 +; GCN-NEXT: s_and_b32 s6, s1, 0x7fff +; GCN-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GCN-NEXT: s_and_b32 s2, s2, 0x8000 +; GCN-NEXT: s_and_b32 s7, s0, 0x7fff +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_or_b32 s3, s6, s3 +; GCN-NEXT: s_and_b32 s5, s5, 0x8000 +; GCN-NEXT: s_or_b32 s2, s7, s2 +; GCN-NEXT: s_and_b32 s4, s4, 0x8000 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s1, s3, s1 +; GCN-NEXT: s_or_b32 s0, s2, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s6 -; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s2 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v5, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v3, v4, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v2 +; GFX7-NEXT: s_lshr_b32 s5, s3, 16 +; GFX7-NEXT: s_and_b32 s6, s1, 0x7fff +; GFX7-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GFX7-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX7-NEXT: s_or_b32 s1, s1, s5 +; GFX7-NEXT: s_or_b32 s3, s6, s3 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s4, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s3, s1 +; GFX7-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX7-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s2, s3, s2 +; GFX7-NEXT: s_and_b32 s3, s4, 0x8000 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_or_b32 s0, s0, s3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v4bf16: @@ -1365,130 +1305,82 @@ define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v8bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s9 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s8 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s10 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s13 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s12 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s15 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s14 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v12, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v13, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v14, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v15, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 -; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 -; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v6, v14, v6 -; GCN-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: v_readfirstlane_b32 s2, v4 -; GCN-NEXT: v_readfirstlane_b32 s3, v6 +; GCN-NEXT: s_lshr_b32 s8, s4, 16 +; GCN-NEXT: s_lshr_b32 s9, s5, 16 +; GCN-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NEXT: s_lshr_b32 s11, s7, 16 +; GCN-NEXT: s_and_b32 s7, s7, 0x8000 +; GCN-NEXT: s_and_b32 s12, s3, 0x7fff +; GCN-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GCN-NEXT: s_and_b32 s6, s6, 0x8000 +; GCN-NEXT: s_and_b32 s13, s2, 0x7fff +; GCN-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GCN-NEXT: s_and_b32 s5, s5, 0x8000 +; GCN-NEXT: s_and_b32 s14, s1, 0x7fff +; GCN-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GCN-NEXT: s_and_b32 s4, s4, 0x8000 +; GCN-NEXT: s_and_b32 s15, s0, 0x7fff +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_or_b32 s7, s12, s7 +; GCN-NEXT: s_and_b32 s11, s11, 0x8000 +; GCN-NEXT: s_or_b32 s6, s13, s6 +; GCN-NEXT: s_and_b32 s10, s10, 0x8000 +; GCN-NEXT: s_or_b32 s5, s14, s5 +; GCN-NEXT: s_and_b32 s9, s9, 0x8000 +; GCN-NEXT: s_or_b32 s4, s15, s4 +; GCN-NEXT: s_and_b32 s8, s8, 0x8000 +; GCN-NEXT: s_or_b32 s3, s3, s11 +; GCN-NEXT: s_or_b32 s2, s2, s10 +; GCN-NEXT: s_or_b32 s1, s1, s9 +; GCN-NEXT: s_or_b32 s0, s0, s8 +; GCN-NEXT: s_lshl_b32 s3, s3, 16 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_or_b32 s2, s6, s2 +; GCN-NEXT: s_or_b32 s1, s5, s1 +; GCN-NEXT: s_or_b32 s0, s4, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v8bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s15 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s14 -; GFX7-NEXT: v_mul_f32_e64 v14, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s6 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s12 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v6, v14, v6 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s13 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s4 -; GFX7-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s5 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v7, v13, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v7, v12, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s10 -; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v5, v11, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GFX7-NEXT: v_bfe_u32 v5, v10, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s8 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s9 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s1 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v9, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v3, v8, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7-NEXT: v_readfirstlane_b32 s2, v4 -; GFX7-NEXT: v_readfirstlane_b32 s3, v6 +; GFX7-NEXT: s_lshr_b32 s11, s7, 16 +; GFX7-NEXT: s_and_b32 s12, s3, 0x7fff +; GFX7-NEXT: s_and_b32 s11, s11, 0x8000 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GFX7-NEXT: s_and_b32 s7, s7, 0x8000 +; GFX7-NEXT: s_or_b32 s3, s3, s11 +; GFX7-NEXT: s_or_b32 s7, s12, s7 +; GFX7-NEXT: s_lshl_b32 s3, s3, 16 +; GFX7-NEXT: s_lshr_b32 s10, s6, 16 +; GFX7-NEXT: s_or_b32 s3, s7, s3 +; GFX7-NEXT: s_and_b32 s6, s6, 0x8000 +; GFX7-NEXT: s_and_b32 s7, s2, 0x7fff +; GFX7-NEXT: s_or_b32 s6, s7, s6 +; GFX7-NEXT: s_and_b32 s7, s10, 0x8000 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GFX7-NEXT: s_or_b32 s2, s2, s7 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_lshr_b32 s9, s5, 16 +; GFX7-NEXT: s_or_b32 s2, s6, s2 +; GFX7-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX7-NEXT: s_and_b32 s6, s1, 0x7fff +; GFX7-NEXT: s_or_b32 s5, s6, s5 +; GFX7-NEXT: s_and_b32 s6, s9, 0x8000 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GFX7-NEXT: s_or_b32 s1, s1, s6 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s8, s4, 16 +; GFX7-NEXT: s_or_b32 s1, s5, s1 +; GFX7-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX7-NEXT: s_and_b32 s5, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: s_and_b32 s5, s8, 0x8000 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s0, s4, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v8bf16: @@ -1574,250 +1466,154 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v16bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s20 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s23 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s22 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s27 -; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; GCN-NEXT: v_mul_f32_e64 v12, 1.0, s29 -; GCN-NEXT: v_mul_f32_e64 v13, 1.0, s28 -; GCN-NEXT: v_mul_f32_e64 v14, 1.0, s31 -; GCN-NEXT: v_mul_f32_e64 v15, 1.0, s30 -; GCN-NEXT: v_mul_f32_e64 v16, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v17, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v18, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GCN-NEXT: v_or_b32_e32 v15, v19, v15 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s15 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 -; GCN-NEXT: v_or_b32_e32 v14, v19, v14 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GCN-NEXT: v_or_b32_e32 v13, v19, v13 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s13 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s11 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; GCN-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_or_b32_e32 v7, v19, v7 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s7 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_or_b32_e32 v5, v19, v5 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 -; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 -; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v3, v19, v3 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v1, v17, v1 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 -; GCN-NEXT: v_readfirstlane_b32 s2, v4 -; GCN-NEXT: v_readfirstlane_b32 s3, v6 -; GCN-NEXT: v_readfirstlane_b32 s4, v8 -; GCN-NEXT: v_readfirstlane_b32 s5, v10 -; GCN-NEXT: v_readfirstlane_b32 s6, v12 -; GCN-NEXT: v_readfirstlane_b32 s7, v14 +; GCN-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NEXT: s_lshr_b32 s17, s9, 16 +; GCN-NEXT: s_lshr_b32 s18, s10, 16 +; GCN-NEXT: s_lshr_b32 s19, s11, 16 +; GCN-NEXT: s_lshr_b32 s20, s12, 16 +; GCN-NEXT: s_lshr_b32 s21, s13, 16 +; GCN-NEXT: s_lshr_b32 s22, s14, 16 +; GCN-NEXT: s_lshr_b32 s23, s15, 16 +; GCN-NEXT: s_and_b32 s15, s15, 0x8000 +; GCN-NEXT: s_and_b32 s24, s7, 0x7fff +; GCN-NEXT: s_bfe_u32 s7, s7, 0xf0010 +; GCN-NEXT: s_and_b32 s14, s14, 0x8000 +; GCN-NEXT: s_and_b32 s25, s6, 0x7fff +; GCN-NEXT: s_bfe_u32 s6, s6, 0xf0010 +; GCN-NEXT: s_and_b32 s13, s13, 0x8000 +; GCN-NEXT: s_and_b32 s26, s5, 0x7fff +; GCN-NEXT: s_bfe_u32 s5, s5, 0xf0010 +; GCN-NEXT: s_and_b32 s12, s12, 0x8000 +; GCN-NEXT: s_and_b32 s27, s4, 0x7fff +; GCN-NEXT: s_bfe_u32 s4, s4, 0xf0010 +; GCN-NEXT: s_and_b32 s11, s11, 0x8000 +; GCN-NEXT: s_and_b32 s28, s3, 0x7fff +; GCN-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GCN-NEXT: s_and_b32 s10, s10, 0x8000 +; GCN-NEXT: s_and_b32 s29, s2, 0x7fff +; GCN-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GCN-NEXT: s_and_b32 s9, s9, 0x8000 +; GCN-NEXT: s_and_b32 s30, s1, 0x7fff +; GCN-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GCN-NEXT: s_and_b32 s8, s8, 0x8000 +; GCN-NEXT: s_and_b32 s31, s0, 0x7fff +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_or_b32 s15, s24, s15 +; GCN-NEXT: s_and_b32 s23, s23, 0x8000 +; GCN-NEXT: s_or_b32 s14, s25, s14 +; GCN-NEXT: s_and_b32 s22, s22, 0x8000 +; GCN-NEXT: s_or_b32 s13, s26, s13 +; GCN-NEXT: s_and_b32 s21, s21, 0x8000 +; GCN-NEXT: s_or_b32 s12, s27, s12 +; GCN-NEXT: s_and_b32 s20, s20, 0x8000 +; GCN-NEXT: s_or_b32 s11, s28, s11 +; GCN-NEXT: s_and_b32 s19, s19, 0x8000 +; GCN-NEXT: s_or_b32 s10, s29, s10 +; GCN-NEXT: s_and_b32 s18, s18, 0x8000 +; GCN-NEXT: s_or_b32 s9, s30, s9 +; GCN-NEXT: s_and_b32 s17, s17, 0x8000 +; GCN-NEXT: s_or_b32 s8, s31, s8 +; GCN-NEXT: s_and_b32 s16, s16, 0x8000 +; GCN-NEXT: s_or_b32 s7, s7, s23 +; GCN-NEXT: s_or_b32 s6, s6, s22 +; GCN-NEXT: s_or_b32 s5, s5, s21 +; GCN-NEXT: s_or_b32 s4, s4, s20 +; GCN-NEXT: s_or_b32 s3, s3, s19 +; GCN-NEXT: s_or_b32 s2, s2, s18 +; GCN-NEXT: s_or_b32 s1, s1, s17 +; GCN-NEXT: s_or_b32 s0, s0, s16 +; GCN-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_lshl_b32 s3, s3, 16 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s7, s15, s7 +; GCN-NEXT: s_or_b32 s6, s14, s6 +; GCN-NEXT: s_or_b32 s5, s13, s5 +; GCN-NEXT: s_or_b32 s4, s12, s4 +; GCN-NEXT: s_or_b32 s3, s11, s3 +; GCN-NEXT: s_or_b32 s2, s10, s2 +; GCN-NEXT: s_or_b32 s1, s9, s1 +; GCN-NEXT: s_or_b32 s0, s8, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v16bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s30 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s14 -; GFX7-NEXT: v_mul_f32_e64 v14, 1.0, s31 -; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s15 -; GFX7-NEXT: v_and_b32_e32 v14, 0x8000, v14 -; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v14, v19, v14 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s28 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s12 -; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s29 -; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s13 -; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v12 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_or_b32_e32 v12, v13, v12 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s10 -; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s27 -; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s11 -; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v10, v13, v10 -; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s8 -; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; GFX7-NEXT: v_and_b32_e32 v9, 0x8000, v9 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v9, v11, v9 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s9 -; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v8, v11, v8 -; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s22 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s23 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v9, v11, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s20 -; GFX7-NEXT: v_or_b32_e32 v6, v9, v6 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s5 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v7, v13, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v7, v15, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v18, 1.0, s3 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v5, v19, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GFX7-NEXT: v_bfe_u32 v5, v18, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v17, 1.0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v16, 1.0, s1 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v17, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v3, v16, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7-NEXT: v_readfirstlane_b32 s2, v4 -; GFX7-NEXT: v_readfirstlane_b32 s3, v6 -; GFX7-NEXT: v_readfirstlane_b32 s4, v8 -; GFX7-NEXT: v_readfirstlane_b32 s5, v10 -; GFX7-NEXT: v_readfirstlane_b32 s6, v12 -; GFX7-NEXT: v_readfirstlane_b32 s7, v14 +; GFX7-NEXT: s_lshr_b32 s23, s15, 16 +; GFX7-NEXT: s_and_b32 s24, s7, 0x7fff +; GFX7-NEXT: s_and_b32 s23, s23, 0x8000 +; GFX7-NEXT: s_bfe_u32 s7, s7, 0xf0010 +; GFX7-NEXT: s_and_b32 s15, s15, 0x8000 +; GFX7-NEXT: s_or_b32 s7, s7, s23 +; GFX7-NEXT: s_or_b32 s15, s24, s15 +; GFX7-NEXT: s_lshl_b32 s7, s7, 16 +; GFX7-NEXT: s_lshr_b32 s22, s14, 16 +; GFX7-NEXT: s_or_b32 s7, s15, s7 +; GFX7-NEXT: s_and_b32 s14, s14, 0x8000 +; GFX7-NEXT: s_and_b32 s15, s6, 0x7fff +; GFX7-NEXT: s_or_b32 s14, s15, s14 +; GFX7-NEXT: s_and_b32 s15, s22, 0x8000 +; GFX7-NEXT: s_bfe_u32 s6, s6, 0xf0010 +; GFX7-NEXT: s_or_b32 s6, s6, s15 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_lshr_b32 s21, s13, 16 +; GFX7-NEXT: s_or_b32 s6, s14, s6 +; GFX7-NEXT: s_and_b32 s13, s13, 0x8000 +; GFX7-NEXT: s_and_b32 s14, s5, 0x7fff +; GFX7-NEXT: s_or_b32 s13, s14, s13 +; GFX7-NEXT: s_and_b32 s14, s21, 0x8000 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0xf0010 +; GFX7-NEXT: s_or_b32 s5, s5, s14 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_lshr_b32 s20, s12, 16 +; GFX7-NEXT: s_or_b32 s5, s13, s5 +; GFX7-NEXT: s_and_b32 s12, s12, 0x8000 +; GFX7-NEXT: s_and_b32 s13, s4, 0x7fff +; GFX7-NEXT: s_or_b32 s12, s13, s12 +; GFX7-NEXT: s_and_b32 s13, s20, 0x8000 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0xf0010 +; GFX7-NEXT: s_or_b32 s4, s4, s13 +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_lshr_b32 s19, s11, 16 +; GFX7-NEXT: s_or_b32 s4, s12, s4 +; GFX7-NEXT: s_and_b32 s11, s11, 0x8000 +; GFX7-NEXT: s_and_b32 s12, s3, 0x7fff +; GFX7-NEXT: s_or_b32 s11, s12, s11 +; GFX7-NEXT: s_and_b32 s12, s19, 0x8000 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GFX7-NEXT: s_or_b32 s3, s3, s12 +; GFX7-NEXT: s_lshl_b32 s3, s3, 16 +; GFX7-NEXT: s_lshr_b32 s18, s10, 16 +; GFX7-NEXT: s_or_b32 s3, s11, s3 +; GFX7-NEXT: s_and_b32 s10, s10, 0x8000 +; GFX7-NEXT: s_and_b32 s11, s2, 0x7fff +; GFX7-NEXT: s_or_b32 s10, s11, s10 +; GFX7-NEXT: s_and_b32 s11, s18, 0x8000 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GFX7-NEXT: s_or_b32 s2, s2, s11 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_lshr_b32 s17, s9, 16 +; GFX7-NEXT: s_or_b32 s2, s10, s2 +; GFX7-NEXT: s_and_b32 s9, s9, 0x8000 +; GFX7-NEXT: s_and_b32 s10, s1, 0x7fff +; GFX7-NEXT: s_or_b32 s9, s10, s9 +; GFX7-NEXT: s_and_b32 s10, s17, 0x8000 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; GFX7-NEXT: s_or_b32 s1, s1, s10 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s16, s8, 16 +; GFX7-NEXT: s_or_b32 s1, s9, s1 +; GFX7-NEXT: s_and_b32 s8, s8, 0x8000 +; GFX7-NEXT: s_and_b32 s9, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s8, s9, s8 +; GFX7-NEXT: s_and_b32 s9, s16, 0x8000 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_or_b32 s0, s0, s9 +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v16bf16: @@ -1955,39 +1751,29 @@ define <2 x bfloat> @v_copysign_v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign) { ; GCN-LABEL: v_copysign_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v2bf16: @@ -2023,53 +1809,35 @@ define <3 x bfloat> @v_copysign_v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) { ; GCN-LABEL: v_copysign_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v3bf16: @@ -2109,67 +1877,47 @@ define <4 x bfloat> @v_copysign_v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign) { ; GCN-LABEL: v_copysign_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0x7fff, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; GCN-NEXT: v_or_b32_e32 v3, v6, v3 +; GCN-NEXT: v_or_b32_e32 v2, v7, v2 ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v4bf16: @@ -2209,99 +1957,61 @@ define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) { ; GCN-LABEL: v_copysign_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; GCN-NEXT: v_and_b32_e32 v12, 0x7fff, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; GCN-NEXT: v_and_b32_e32 v13, 0x7fff, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GCN-NEXT: v_and_b32_e32 v14, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_and_b32_e32 v15, 0x7fff, v0 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; GCN-NEXT: v_or_b32_e32 v7, v12, v7 +; GCN-NEXT: v_or_b32_e32 v6, v13, v6 +; GCN-NEXT: v_or_b32_e32 v5, v14, v5 +; GCN-NEXT: v_or_b32_e32 v4, v15, v4 ; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 ; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v6, v6, v14 -; GCN-NEXT: v_or_b32_e32 v5, v5, v13 -; GCN-NEXT: v_or_b32_e32 v4, v4, v12 ; GCN-NEXT: v_or_b32_e32 v3, v3, v11 ; GCN-NEXT: v_or_b32_e32 v2, v2, v10 ; GCN-NEXT: v_or_b32_e32 v1, v1, v9 ; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v5, v1 +; GCN-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 -; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v14, 0x8000, v14 -; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 -; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; GFX7-NEXT: v_and_b32_e32 v12, 0x7fff, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v12, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; GFX7-NEXT: v_and_b32_e32 v12, 0x7fff, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0x7fff, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 @@ -2310,22 +2020,20 @@ define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) { ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v4, v12, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v8bf16: @@ -2373,239 +2081,155 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ; GCN-LABEL: v_copysign_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GCN-NEXT: v_and_b32_e32 v30, 0x8000, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GCN-NEXT: v_and_b32_e32 v29, 0x8000, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v29 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 -; GCN-NEXT: v_and_b32_e32 v28, 0x8000, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0x7fff, v7 +; GCN-NEXT: v_and_b32_e32 v18, 0x8000, v14 +; GCN-NEXT: v_and_b32_e32 v19, 0x7fff, v6 +; GCN-NEXT: v_and_b32_e32 v20, 0x8000, v13 +; GCN-NEXT: v_and_b32_e32 v21, 0x7fff, v5 +; GCN-NEXT: v_and_b32_e32 v22, 0x8000, v12 +; GCN-NEXT: v_and_b32_e32 v23, 0x7fff, v4 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v11 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0x7fff, v3 +; GCN-NEXT: v_or_b32_e32 v20, v21, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0x8000, v10 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0x7fff, v2 +; GCN-NEXT: v_or_b32_e32 v17, v19, v17 +; GCN-NEXT: v_and_b32_e32 v19, 0x8000, v9 +; GCN-NEXT: v_or_b32_e32 v21, v23, v21 +; GCN-NEXT: v_and_b32_e32 v23, 0x7fff, v1 +; GCN-NEXT: v_or_b32_e32 v19, v23, v19 +; GCN-NEXT: v_and_b32_e32 v23, 0x8000, v8 +; GCN-NEXT: v_and_b32_e32 v24, 0x7fff, v0 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GCN-NEXT: v_and_b32_e32 v26, 0x8000, v26 -; GCN-NEXT: v_and_b32_e32 v25, 0x8000, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0x8000, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0x8000, v23 -; GCN-NEXT: v_and_b32_e32 v22, 0x8000, v22 -; GCN-NEXT: v_and_b32_e32 v21, 0x8000, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0x8000, v20 -; GCN-NEXT: v_and_b32_e32 v19, 0x8000, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0x8000, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 -; GCN-NEXT: v_or_b32_e32 v10, v10, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v25 -; GCN-NEXT: v_or_b32_e32 v8, v8, v24 -; GCN-NEXT: v_or_b32_e32 v7, v7, v23 -; GCN-NEXT: v_or_b32_e32 v6, v6, v22 -; GCN-NEXT: v_or_b32_e32 v5, v5, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_or_b32_e32 v1, v1, v17 -; GCN-NEXT: v_or_b32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 +; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 +; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-NEXT: v_or_b32_e32 v7, v7, v15 +; GCN-NEXT: v_or_b32_e32 v6, v6, v14 +; GCN-NEXT: v_or_b32_e32 v5, v5, v13 +; GCN-NEXT: v_or_b32_e32 v4, v4, v12 +; GCN-NEXT: v_or_b32_e32 v3, v3, v11 +; GCN-NEXT: v_or_b32_e32 v2, v2, v10 +; GCN-NEXT: v_or_b32_e32 v1, v1, v9 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v0, v23, v0 +; GCN-NEXT: v_or_b32_e32 v1, v19, v1 +; GCN-NEXT: v_or_b32_e32 v2, v21, v2 +; GCN-NEXT: v_or_b32_e32 v3, v17, v3 +; GCN-NEXT: v_or_b32_e32 v4, v22, v4 +; GCN-NEXT: v_or_b32_e32 v5, v20, v5 +; GCN-NEXT: v_or_b32_e32 v6, v18, v6 +; GCN-NEXT: v_or_b32_e32 v7, v16, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v27, 0x8000, v27 -; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v24 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v30, 0x8000, v30 -; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v29, 0x8000, v29 -; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v28, 0x8000, v28 -; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 -; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 -; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v26, 0x8000, v26 -; GFX7-NEXT: v_and_b32_e32 v25, 0x8000, v25 -; GFX7-NEXT: v_and_b32_e32 v23, 0x8000, v23 +; GFX7-NEXT: v_and_b32_e32 v16, 0x8000, v15 +; GFX7-NEXT: v_and_b32_e32 v17, 0x7fff, v7 +; GFX7-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX7-NEXT: v_and_b32_e32 v17, 0x8000, v14 +; GFX7-NEXT: v_and_b32_e32 v18, 0x7fff, v6 +; GFX7-NEXT: v_or_b32_e32 v17, v18, v17 +; GFX7-NEXT: v_and_b32_e32 v18, 0x8000, v13 +; GFX7-NEXT: v_and_b32_e32 v19, 0x7fff, v5 +; GFX7-NEXT: v_or_b32_e32 v18, v19, v18 +; GFX7-NEXT: v_and_b32_e32 v19, 0x8000, v12 +; GFX7-NEXT: v_and_b32_e32 v20, 0x7fff, v4 +; GFX7-NEXT: v_or_b32_e32 v19, v20, v19 +; GFX7-NEXT: v_and_b32_e32 v20, 0x8000, v11 +; GFX7-NEXT: v_and_b32_e32 v21, 0x7fff, v3 +; GFX7-NEXT: v_or_b32_e32 v20, v21, v20 +; GFX7-NEXT: v_and_b32_e32 v21, 0x8000, v10 +; GFX7-NEXT: v_and_b32_e32 v22, 0x7fff, v2 +; GFX7-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX7-NEXT: v_and_b32_e32 v22, 0x8000, v9 +; GFX7-NEXT: v_and_b32_e32 v23, 0x7fff, v1 +; GFX7-NEXT: v_or_b32_e32 v22, v23, v22 +; GFX7-NEXT: v_and_b32_e32 v23, 0x8000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v24, 0x7fff, v0 +; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v22, 0x8000, v22 +; GFX7-NEXT: v_and_b32_e32 v14, 0x8000, v14 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v21, 0x8000, v21 +; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v20, 0x8000, v20 +; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v19, 0x8000, v19 +; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v18, 0x8000, v18 +; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v17, 0x8000, v17 +; GFX7-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v30 -; GFX7-NEXT: v_or_b32_e32 v13, v13, v29 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v28 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v26 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v25 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 -; GFX7-NEXT: v_or_b32_e32 v15, v15, v24 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v22 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v21 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v20 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v19 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v17 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v12 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_or_b32_e32 v23, v24, v23 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v20, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v19, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v17, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v16, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v16bf16: @@ -2669,264 +2293,114 @@ define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign ; GCN-LABEL: v_copysign_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_bfe_u32 v32, v32, 16, 15 -; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v31 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v30 +; GCN-NEXT: v_and_b32_e32 v32, 0x7fff, v14 +; GCN-NEXT: v_and_b32_e32 v33, 0x8000, v29 +; GCN-NEXT: v_and_b32_e32 v34, 0x7fff, v13 +; GCN-NEXT: v_and_b32_e32 v35, 0x8000, v28 +; GCN-NEXT: v_and_b32_e32 v36, 0x7fff, v12 +; GCN-NEXT: v_and_b32_e32 v37, 0x8000, v27 +; GCN-NEXT: v_and_b32_e32 v38, 0x7fff, v11 +; GCN-NEXT: v_and_b32_e32 v39, 0x8000, v26 +; GCN-NEXT: v_and_b32_e32 v48, 0x7fff, v10 +; GCN-NEXT: v_and_b32_e32 v49, 0x8000, v25 +; GCN-NEXT: v_and_b32_e32 v50, 0x7fff, v9 +; GCN-NEXT: v_and_b32_e32 v51, 0x8000, v24 ; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_bfe_u32 v30, v30, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_bfe_u32 v29, v29, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_bfe_u32 v28, v28, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_bfe_u32 v27, v27, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_bfe_u32 v26, v26, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_bfe_u32 v25, v25, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_bfe_u32 v24, v24, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_bfe_u32 v23, v23, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_bfe_u32 v22, v22, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_bfe_u32 v21, v21, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_bfe_u32 v20, v20, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_or_b32_e32 v17, v17, v32 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_or_b32_e32 v16, v16, v32 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_and_b32_e32 v52, 0x7fff, v8 +; GCN-NEXT: v_or_b32_e32 v32, v34, v33 +; GCN-NEXT: v_and_b32_e32 v53, 0x8000, v23 +; GCN-NEXT: v_or_b32_e32 v33, v36, v35 +; GCN-NEXT: v_and_b32_e32 v54, 0x7fff, v7 +; GCN-NEXT: v_or_b32_e32 v34, v38, v37 +; GCN-NEXT: v_and_b32_e32 v55, 0x8000, v22 +; GCN-NEXT: v_or_b32_e32 v35, v48, v39 +; GCN-NEXT: v_and_b32_e32 v39, 0x7fff, v6 +; GCN-NEXT: v_or_b32_e32 v36, v50, v49 +; GCN-NEXT: v_and_b32_e32 v48, 0x8000, v21 +; GCN-NEXT: v_or_b32_e32 v37, v52, v51 +; GCN-NEXT: v_and_b32_e32 v49, 0x7fff, v5 +; GCN-NEXT: v_or_b32_e32 v38, v54, v53 +; GCN-NEXT: v_and_b32_e32 v50, 0x8000, v20 +; GCN-NEXT: v_or_b32_e32 v39, v39, v55 +; GCN-NEXT: v_and_b32_e32 v51, 0x7fff, v4 +; GCN-NEXT: v_or_b32_e32 v48, v49, v48 +; GCN-NEXT: v_and_b32_e32 v52, 0x8000, v19 +; GCN-NEXT: v_or_b32_e32 v49, v51, v50 +; GCN-NEXT: v_and_b32_e32 v50, 0x7fff, v3 +; GCN-NEXT: v_or_b32_e32 v50, v50, v52 +; GCN-NEXT: v_and_b32_e32 v51, 0x8000, v18 +; GCN-NEXT: v_and_b32_e32 v52, 0x7fff, v2 +; GCN-NEXT: v_or_b32_e32 v51, v52, v51 +; GCN-NEXT: v_and_b32_e32 v52, 0x8000, v17 +; GCN-NEXT: v_and_b32_e32 v53, 0x7fff, v1 +; GCN-NEXT: v_or_b32_e32 v52, v53, v52 +; GCN-NEXT: v_and_b32_e32 v53, 0x8000, v16 +; GCN-NEXT: v_and_b32_e32 v54, 0x7fff, v0 +; GCN-NEXT: v_or_b32_e32 v53, v54, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_and_b32_e32 v30, 0x8000, v30 +; GCN-NEXT: v_or_b32_e32 v14, v14, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_and_b32_e32 v29, 0x8000, v29 +; GCN-NEXT: v_or_b32_e32 v13, v13, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_or_b32_e32 v10, v10, v32 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_or_b32_e32 v7, v7, v32 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_or_b32_e32 v6, v6, v32 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_or_b32_e32 v5, v5, v32 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_or_b32_e32 v2, v2, v32 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v32 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v32 +; GCN-NEXT: v_and_b32_e32 v28, 0x8000, v28 +; GCN-NEXT: v_or_b32_e32 v12, v12, v28 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 +; GCN-NEXT: v_or_b32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0x7fff, v15 +; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GCN-NEXT: v_and_b32_e32 v26, 0x8000, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0x8000, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0x8000, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0x8000, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0x8000, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0x8000, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0x8000, v20 +; GCN-NEXT: v_and_b32_e32 v19, 0x8000, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0x8000, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; GCN-NEXT: v_or_b32_e32 v10, v10, v26 +; GCN-NEXT: v_or_b32_e32 v9, v9, v25 +; GCN-NEXT: v_or_b32_e32 v8, v8, v24 +; GCN-NEXT: v_or_b32_e32 v7, v7, v23 +; GCN-NEXT: v_or_b32_e32 v6, v6, v22 +; GCN-NEXT: v_or_b32_e32 v5, v5, v21 +; GCN-NEXT: v_or_b32_e32 v4, v4, v20 +; GCN-NEXT: v_or_b32_e32 v3, v3, v19 +; GCN-NEXT: v_or_b32_e32 v2, v2, v18 +; GCN-NEXT: v_or_b32_e32 v1, v1, v17 +; GCN-NEXT: v_or_b32_e32 v0, v0, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -2942,318 +2416,204 @@ define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v0, v53, v0 +; GCN-NEXT: v_or_b32_e32 v1, v52, v1 +; GCN-NEXT: v_or_b32_e32 v2, v51, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v48, v5 +; GCN-NEXT: v_or_b32_e32 v6, v39, v6 +; GCN-NEXT: v_or_b32_e32 v7, v38, v7 +; GCN-NEXT: v_or_b32_e32 v8, v37, v8 +; GCN-NEXT: v_or_b32_e32 v9, v36, v9 +; GCN-NEXT: v_or_b32_e32 v10, v35, v10 +; GCN-NEXT: v_or_b32_e32 v11, v34, v11 +; GCN-NEXT: v_or_b32_e32 v12, v33, v12 +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v28 +; GCN-NEXT: v_or_b32_e32 v17, v27, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v14, v31, v14 +; GCN-NEXT: v_or_b32_e32 v15, v17, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_bfe_u32 v30, v30, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_bfe_u32 v29, v29, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_bfe_u32 v28, v28, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_bfe_u32 v27, v27, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_bfe_u32 v26, v26, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_bfe_u32 v25, v25, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_bfe_u32 v24, v24, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_bfe_u32 v23, v23, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_bfe_u32 v22, v22, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_bfe_u32 v21, v21, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_bfe_u32 v20, v20, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_bfe_u32 v18, v18, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_bfe_u32 v17, v17, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_bfe_u32 v16, v16, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: v_and_b32_e32 v58, 0x8000, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v59, 0x7fff, v1 +; GFX7-NEXT: v_and_b32_e32 v17, 0x8000, v17 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_and_b32_e32 v38, 0x8000, v27 +; GFX7-NEXT: v_and_b32_e32 v39, 0x7fff, v11 +; GFX7-NEXT: v_and_b32_e32 v48, 0x8000, v26 +; GFX7-NEXT: v_and_b32_e32 v49, 0x7fff, v10 +; GFX7-NEXT: v_and_b32_e32 v50, 0x8000, v25 +; GFX7-NEXT: v_and_b32_e32 v51, 0x7fff, v9 +; GFX7-NEXT: v_and_b32_e32 v40, 0x8000, v22 +; GFX7-NEXT: v_and_b32_e32 v41, 0x7fff, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v52, 0x8000, v24 +; GFX7-NEXT: v_and_b32_e32 v53, 0x7fff, v8 +; GFX7-NEXT: v_and_b32_e32 v54, 0x8000, v23 +; GFX7-NEXT: v_and_b32_e32 v55, 0x7fff, v7 +; GFX7-NEXT: v_and_b32_e32 v42, 0x8000, v21 +; GFX7-NEXT: v_and_b32_e32 v43, 0x7fff, v5 +; GFX7-NEXT: v_and_b32_e32 v44, 0x8000, v20 +; GFX7-NEXT: v_and_b32_e32 v45, 0x7fff, v4 +; GFX7-NEXT: v_and_b32_e32 v46, 0x8000, v19 +; GFX7-NEXT: v_and_b32_e32 v47, 0x7fff, v3 +; GFX7-NEXT: v_and_b32_e32 v56, 0x8000, v18 +; GFX7-NEXT: v_and_b32_e32 v57, 0x7fff, v2 +; GFX7-NEXT: v_or_b32_e32 v38, v39, v38 +; GFX7-NEXT: v_or_b32_e32 v39, v49, v48 +; GFX7-NEXT: v_or_b32_e32 v48, v51, v50 +; GFX7-NEXT: v_or_b32_e32 v51, v41, v40 +; GFX7-NEXT: v_or_b32_e32 v40, v59, v58 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v49, v53, v52 +; GFX7-NEXT: v_or_b32_e32 v50, v55, v54 +; GFX7-NEXT: v_or_b32_e32 v52, v43, v42 +; GFX7-NEXT: v_or_b32_e32 v53, v45, v44 +; GFX7-NEXT: v_or_b32_e32 v54, v47, v46 +; GFX7-NEXT: v_or_b32_e32 v55, v57, v56 +; GFX7-NEXT: v_or_b32_e32 v1, v40, v1 +; GFX7-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v30 +; GFX7-NEXT: v_and_b32_e32 v33, 0x7fff, v14 +; GFX7-NEXT: v_and_b32_e32 v34, 0x8000, v29 +; GFX7-NEXT: v_and_b32_e32 v35, 0x7fff, v13 +; GFX7-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX7-NEXT: v_and_b32_e32 v33, 0x8000, v16 +; GFX7-NEXT: v_or_b32_e32 v34, v35, v34 +; GFX7-NEXT: v_and_b32_e32 v35, 0x7fff, v0 +; GFX7-NEXT: v_and_b32_e32 v36, 0x8000, v28 +; GFX7-NEXT: v_and_b32_e32 v37, 0x7fff, v12 +; GFX7-NEXT: v_or_b32_e32 v33, v35, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX7-NEXT: v_and_b32_e32 v37, 0x7fff, v15 ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v30, 0x8000, v30 ; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_and_b32_e32 v29, 0x8000, v29 ; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_and_b32_e32 v28, 0x8000, v28 ; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_and_b32_e32 v27, 0x8000, v27 ; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_and_b32_e32 v26, 0x8000, v26 ; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_and_b32_e32 v25, 0x8000, v25 ; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 ; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v23, 0x8000, v23 ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v22, 0x8000, v22 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v21, 0x8000, v21 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v20, 0x8000, v20 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v19, 0x8000, v19 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v18, 0x8000, v18 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0x8000, v16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_or_b32_e32 v14, v14, v30 +; GFX7-NEXT: v_or_b32_e32 v13, v13, v29 +; GFX7-NEXT: v_or_b32_e32 v12, v12, v28 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v26 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v25 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v23 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v22 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v20 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v35, 0x8000, v31 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_bfe_u32 v32, v32, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v31, 0x8000, v31 -; GFX7-NEXT: v_or_b32_e32 v31, v32, v31 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_or_b32_e32 v15, v15, v31 +; GFX7-NEXT: v_or_b32_e32 v35, v37, v35 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v32 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v33, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v54, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v53, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v51, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v50, v7 +; GFX7-NEXT: v_or_b32_e32 v8, v49, v8 +; GFX7-NEXT: v_or_b32_e32 v9, v48, v9 +; GFX7-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX7-NEXT: v_or_b32_e32 v11, v38, v11 +; GFX7-NEXT: v_or_b32_e32 v12, v36, v12 +; GFX7-NEXT: v_or_b32_e32 v13, v34, v13 +; GFX7-NEXT: v_or_b32_e32 v14, v32, v14 +; GFX7-NEXT: v_or_b32_e32 v15, v35, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v32bf16: @@ -3864,25 +3224,21 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2bf16_sign_v2f32(<2 x bfloat> %mag ; GCN-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_bfi_b32 v1, s4, v3, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s4, v3, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: @@ -3932,21 +3288,19 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> %mag, ; GCN-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: @@ -4011,16 +3365,18 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_bfi_b32 v3, s4, v3, v4 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: @@ -4084,39 +3440,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v3 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: @@ -4223,20 +3573,17 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v4 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v5 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: @@ -4244,18 +3591,15 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: @@ -4476,39 +3820,35 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fff, v0 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: @@ -4613,35 +3953,31 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v1, v1, v2 ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: @@ -4693,30 +4029,30 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2bf16_sign_v2f32(<2 x bfloat> inreg %mag, <2 x float> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: s_brev_b32 s0, -2 +; GCN-NEXT: s_lshl_b32 s3, s0, 16 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff0000 +; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_bfi_b32 v0, s0, v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s0, v1, v3 +; GCN-NEXT: v_bfi_b32 v1, s4, v2, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: s_brev_b32 s0, -2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX7-NEXT: s_lshl_b32 s3, s0, 16 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff0000 +; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX7-NEXT: v_readfirstlane_b32 s0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -4784,28 +4120,30 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2bf16_sign_v2f32(<2 x bflo define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> inreg %mag, <2 x bfloat> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s3 -; GCN-NEXT: s_brev_b32 s2, -2 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_bfi_b32 v1, s2, v2, v1 -; GCN-NEXT: v_bfi_b32 v0, s2, v3, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_lshl_b32 s3, s2, 16 +; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s3 -; GFX7-NEXT: s_brev_b32 s2, -2 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_bfi_b32 v1, s2, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: s_lshl_b32 s3, s2, 16 +; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: @@ -4881,28 +4219,30 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_brev_b32 s6, -2 +; GCN-NEXT: s_brev_b32 s5, -2 ; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_lshl_b32 s3, s4, 16 ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1 -; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_bfi_b32 v0, s5, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_bfi_b32 v1, s5, v2, v1 ; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_brev_b32 s6, -2 +; GFX7-NEXT: s_brev_b32 s5, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshl_b32 s3, s4, 16 +; GFX7-NEXT: v_bfi_b32 v0, s5, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: v_bfi_b32 v1, s5, v1, v2 ; GFX7-NEXT: v_readfirstlane_b32 s3, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: @@ -4979,18 +4319,15 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> inreg %mag, <2 x bfloat> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: s_lshr_b32 s3, s2, 16 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: s_and_b32 s0, s2, 0x8000 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: s_and_b32 s1, s3, 0x8000 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v1, s0, v1 +; GCN-NEXT: v_or_b32_e32 v0, s1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -4998,18 +4335,15 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: s_lshr_b32 s3, s2, 16 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 0x8000 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_and_b32 s0, s3, 0x8000 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -5124,18 +4458,15 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GCN-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] -; GCN-NEXT: v_cvt_f32_f64_e32 v3, s[0:1] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: s_lshr_b32 s5, s4, 16 +; GCN-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GCN-NEXT: v_cvt_f32_f64_e32 v1, s[0:1] +; GCN-NEXT: s_and_b32 s0, s4, 0x8000 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: s_and_b32 s1, s5, 0x8000 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v1, s0, v1 +; GCN-NEXT: v_or_b32_e32 v0, s1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -5143,20 +4474,17 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] -; GFX7-NEXT: v_cvt_f32_f64_e32 v3, s[2:3] -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v2, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX7-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX7-NEXT: s_lshr_b32 s0, s4, 16 +; GFX7-NEXT: s_and_b32 s0, s0, 0x8000 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: s_and_b32 s1, s4, 0x8000 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -5378,18 +4706,16 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> inreg %mag, <2 x float> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GCN-NEXT: s_and_b32 s1, s0, 0x7fff +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v1, s1, v1 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -5397,18 +4723,16 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_and_b32 s1, s0, 0x7fff +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -5522,36 +4846,30 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> inreg %mag, <2 x double> inreg %sign) { ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s3, 0x80000000 -; GCN-NEXT: s_and_b32 s1, s5, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v1, s0, v1 -; GCN-NEXT: v_or_b32_e32 v0, s1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s1, s0, 0x7fff +; GCN-NEXT: s_and_b32 s2, s2, 0x80000000 +; GCN-NEXT: s_and_b32 s3, s4, 0x80000000 +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_or_b32 s0, s0, s3 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s3, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX7-NEXT: s_and_b32 s0, s5, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX7-NEXT: s_and_b32 s1, s0, 0x7fff +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 +; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: @@ -5604,31 +4922,25 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3bf16_sign_v3f32(<3 x bfloat> %mag ; GCN-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GCN-NEXT: v_bfi_b32 v2, s4, v5, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX7-NEXT: v_bfi_b32 v2, s4, v5, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: @@ -5687,25 +4999,23 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3bf16(<3 x float> %mag, ; GCN-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v5 +; GCN-NEXT: v_bfi_b32 v2, s4, v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v5 +; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: @@ -5780,18 +5090,22 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %ma ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 -; GCN-NEXT: v_bfi_b32 v5, s4, v5, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_bfi_b32 v3, s4, v3, v6 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v8 +; GCN-NEXT: v_bfi_b32 v5, s4, v5, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v6 -; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 -; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: @@ -5865,53 +5179,41 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %ma ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_bfe_u32 v6, v1, 16, 15 +; GCN-NEXT: v_or_b32_e32 v1, v2, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v5 +; GCN-NEXT: v_or_b32_e32 v2, v6, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GFX7-NEXT: v_bfe_u32 v3, v5, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: @@ -6055,27 +5357,21 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] -; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[4:5] +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v7 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: @@ -6083,25 +5379,19 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5] ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v8 +; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v7 ; GFX7-NEXT: v_bfe_u32 v3, v4, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v6 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: @@ -6411,53 +5701,45 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: @@ -6597,47 +5879,39 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v7 +; GCN-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v8 +; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v7 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: @@ -6721,37 +5995,29 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4bf16_sign_v4f32(<4 x bfloat> %mag ; GCN-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GCN-NEXT: v_bfi_b32 v2, s4, v7, v4 +; GCN-NEXT: v_bfi_b32 v3, s4, v6, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v4 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5 -; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v6 -; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX7-NEXT: v_bfi_b32 v2, s4, v7, v4 +; GFX7-NEXT: v_bfi_b32 v3, s4, v6, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: @@ -6818,29 +6084,27 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4bf16(<4 x float> %mag, ; GCN-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v7 +; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GCN-NEXT: v_bfi_b32 v3, s4, v3, v6 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v4 ; GFX7-NEXT: s_brev_b32 s4, -2 -; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v4 -; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5 -; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v6 -; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v7 +; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v5 +; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: @@ -6929,20 +6193,24 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %ma ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v8 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9 -; GCN-NEXT: v_bfi_b32 v5, s4, v5, v10 -; GCN-NEXT: v_bfi_b32 v7, s4, v7, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; GCN-NEXT: v_bfi_b32 v3, s4, v3, v8 +; GCN-NEXT: v_bfi_b32 v7, s4, v7, v9 +; GCN-NEXT: v_bfi_b32 v1, s4, v1, v10 +; GCN-NEXT: v_bfi_b32 v5, s4, v5, v11 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v8 -; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9 -; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v10 -; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v9 +; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v8 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: @@ -7030,67 +6298,55 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16(<4 x float> %ma ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 ; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v7 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v6 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: @@ -7261,67 +6517,55 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] -; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[4:5] -; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[6:7] -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[6:7] +; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v9 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v8 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v11 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v10 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v5 +; GCN-NEXT: v_or_b32_e32 v2, v2, v7 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[6:7] ; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[6:7] +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v9 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v11 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v10 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v9 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v10 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: @@ -7711,67 +6955,59 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; GCN-NEXT: v_and_b32_e32 v7, 0x7fff, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-NEXT: v_or_b32_e32 v2, v7, v2 ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_or_b32_e32 v1, v4, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0x7fff, v0 ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: @@ -7938,59 +7174,51 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fff, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x80000000, v11 +; GCN-NEXT: v_and_b32_e32 v6, 0x7fff, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3 ; GCN-NEXT: v_and_b32_e32 v7, 0x80000000, v9 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v7 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 +; GCN-NEXT: v_or_b32_e32 v3, v6, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v7 ; GCN-NEXT: v_or_b32_e32 v0, v0, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v7 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: @@ -8447,22 +7675,12 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) { define amdgpu_ps i32 @s_copysign_v2bf16_0_v2bf16(<2 x bfloat> inreg %sign) { ; GCN-LABEL: s_copysign_v2bf16_0_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; GCN-NEXT: v_and_b32_e32 v0, 0x80008000, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x80008000 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v2bf16_0_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; GFX7-NEXT: v_and_b32_e32 v0, 0x80008000, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s0, s0, 0x80008000 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v2bf16_0_v2bf16: @@ -8496,19 +7714,13 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf16(<2 x bfloat> %sign) { ; GCN-LABEL: v_copysign_v2bf16_0_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v2bf16_0_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v2bf16_0_v2bf16: @@ -8660,18 +7872,20 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v2bf16_0_v2bf32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v2bf16_0_v2bf32: @@ -8830,9 +8044,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) { ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_v2bf16_0_v2bf64: @@ -8841,9 +8053,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) { ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_v2bf16_0_v2bf64: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index eec6bab67b6c2..7f38e5bb5bb61 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1372,14 +1372,12 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_lshr_b32 s2, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1423,31 +1421,25 @@ define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inr define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v2, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v5, v4 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 -; SI-NEXT: v_or_b32_e32 v4, v4, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: v_readfirstlane_b32 s1, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v3f16: @@ -1494,37 +1486,33 @@ define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x hal define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v2, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s1 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 ; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v5, s0, v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v3, s0, v7, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v1 +; SI-NEXT: v_readfirstlane_b32 s1, v2 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v4f16: @@ -1571,67 +1559,59 @@ define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x hal define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v8f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v2, s9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s3 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 -; SI-NEXT: v_bfi_b32 v10, s0, v11, v10 ; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_bfi_b32 v13, s0, v15, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_bfi_b32 v7, s0, v15, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_bfi_b32 v5, s0, v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v3, s0, v11, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_bfi_b32 v1, s0, v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v1 -; SI-NEXT: v_readfirstlane_b32 s2, v8 -; SI-NEXT: v_readfirstlane_b32 s3, v9 +; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: v_readfirstlane_b32 s2, v4 +; SI-NEXT: v_readfirstlane_b32 s3, v6 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v8f16: @@ -1701,127 +1681,111 @@ define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x hal define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v16f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v16, s31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s30 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s16, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s16, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s16, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_lshr_b32 s16, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: s_lshr_b32 s16, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: s_lshr_b32 s16, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: s_lshr_b32 s16, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: s_lshr_b32 s16, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: s_lshr_b32 s16, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: s_lshr_b32 s16, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: s_lshr_b32 s16, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: s_lshr_b32 s16, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: s_lshr_b32 s16, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 +; SI-NEXT: s_lshr_b32 s16, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_lshr_b32 s16, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s12 -; SI-NEXT: v_bfi_b32 v16, s0, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_bfi_b32 v14, s0, v15, v14 ; SI-NEXT: v_bfi_b32 v18, s0, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s11 -; SI-NEXT: v_bfi_b32 v17, s0, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_bfi_b32 v10, s0, v11, v10 +; SI-NEXT: v_bfi_b32 v15, s0, v19, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 +; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 +; SI-NEXT: v_bfi_b32 v18, s0, v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s10 -; SI-NEXT: v_or_b32_e32 v12, v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 -; SI-NEXT: v_bfi_b32 v13, s0, v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_bfi_b32 v15, s0, v18, v15 -; SI-NEXT: v_bfi_b32 v14, s0, v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_bfi_b32 v11, s0, v19, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s22 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_bfi_b32 v15, s0, v15, v19 -; SI-NEXT: v_bfi_b32 v10, s0, v18, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 +; SI-NEXT: v_or_b32_e32 v12, v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s11 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s3 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s2 ; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v14, s0, v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_bfi_b32 v7, s0, v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_bfi_b32 v5, s0, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v3, s0, v15, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_bfi_b32 v1, s0, v17, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v1 -; SI-NEXT: v_readfirstlane_b32 s2, v8 -; SI-NEXT: v_readfirstlane_b32 s3, v9 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: v_readfirstlane_b32 s2, v4 +; SI-NEXT: v_readfirstlane_b32 s3, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s5, v10 ; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v14 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v16f16: @@ -1931,17 +1895,19 @@ define <2 x half> @v_copysign_v2f16(<2 x half> %mag, <2 x half> %sign) { ; SI-LABEL: v_copysign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16: @@ -1971,22 +1937,23 @@ define <3 x half> @v_copysign_v3f16(<3 x half> %mag, <3 x half> %sign) { ; SI-LABEL: v_copysign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v3f16: @@ -2019,27 +1986,31 @@ define <4 x half> @v_copysign_v4f16(<4 x half> %mag, <4 x half> %sign) { ; SI-LABEL: v_copysign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v4f16: @@ -2072,47 +2043,55 @@ define <8 x half> @v_copysign_v8f16(<8 x half> %mag, <8 x half> %sign) { ; SI-LABEL: v_copysign_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_bfi_b32 v4, s4, v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 +; SI-NEXT: v_bfi_b32 v5, s4, v11, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v9 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v10 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v11 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v12 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v13 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v14 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 +; SI-NEXT: v_bfi_b32 v6, s4, v13, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v7, s4, v15, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v8f16: @@ -2151,89 +2130,103 @@ define <16 x half> @v_copysign_v16f16(<16 x half> %mag, <16 x half> %sign) { ; SI-LABEL: v_copysign_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_bfi_b32 v15, s4, v19, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_bfi_b32 v14, s4, v18, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_bfi_b32 v6, s4, v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v17 +; SI-NEXT: v_bfi_b32 v13, s4, v19, v13 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v18 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_bfi_b32 v12, s4, v18, v12 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v11, s4, v19, v11 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_bfi_b32 v8, s4, v8, v17 -; SI-NEXT: v_bfi_b32 v9, s4, v9, v18 -; SI-NEXT: v_bfi_b32 v10, s4, v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_bfi_b32 v10, s4, v18, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v11, s4, v11, v20 -; SI-NEXT: v_bfi_b32 v12, s4, v12, v17 -; SI-NEXT: v_bfi_b32 v13, s4, v13, v18 -; SI-NEXT: v_bfi_b32 v14, s4, v14, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v15, s4, v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 +; SI-NEXT: v_bfi_b32 v8, s4, v17, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v9 +; SI-NEXT: v_bfi_b32 v9, s4, v19, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v16f16: @@ -2284,233 +2277,201 @@ define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) { ; SI-LABEL: v_copysign_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_bfi_b32 v31, s4, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 +; SI-NEXT: v_bfi_b32 v14, s4, v14, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_bfi_b32 v30, s4, v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_bfi_b32 v13, s4, v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_bfi_b32 v29, s4, v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_bfi_b32 v12, s4, v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_bfi_b32 v28, s4, v32, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_bfi_b32 v11, s4, v11, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_bfi_b32 v27, s4, v32, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_bfi_b32 v10, s4, v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_bfi_b32 v32, s4, v32, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_bfi_b32 v9, s4, v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_bfi_b32 v26, s4, v26, v34 +; SI-NEXT: v_bfi_b32 v15, s4, v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_bfi_b32 v25, s4, v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: v_bfi_b32 v8, s4, v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v8, s4, v8, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v9, s4, v9, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v10, s4, v10, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v11, s4, v11, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v12, s4, v12, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v13, s4, v13, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v14, s4, v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v15, s4, v15, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v16, s4, v16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v17, s4, v17, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v18, s4, v18, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v19, s4, v19, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v20, s4, v20, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v21, s4, v21, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v22, s4, v22, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v23, s4, v23, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v24, s4, v24, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v25, s4, v25, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v26, s4, v26, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v27, s4, v27, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v28, s4, v28, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v29, s4, v29, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v30, s4, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_bfi_b32 v31, s4, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_bfi_b32 v24, s4, v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_bfi_b32 v23, s4, v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_bfi_b32 v6, s4, v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_bfi_b32 v22, s4, v33, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_bfi_b32 v21, s4, v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_bfi_b32 v20, s4, v33, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v19, s4, v33, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_bfi_b32 v18, s4, v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v17, s4, v33, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v10, v16, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v32f32: @@ -2935,13 +2896,12 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f16_sign_v2f32(<2 x half> %mag, < ; SI-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s4, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32: @@ -2994,11 +2954,10 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> %mag, ; SI-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3054,6 +3013,9 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag ; SI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v5 @@ -3110,17 +3072,20 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, < ; SI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16: @@ -3170,17 +3135,17 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 ; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 -; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v0 ; SI-NEXT: v_med3_i32 v8, v8, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 @@ -3190,7 +3155,7 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v8, v2, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v8 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_and_b32_e32 v8, 7, v6 @@ -3205,61 +3170,64 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v6, v1, 20, 11 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 -; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v7, v0, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s6, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 +; SI-NEXT: v_med3_i32 v10, v10, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v11, v10, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v10, v2, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SI-NEXT: v_and_b32_e32 v10, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v1, s4, v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: @@ -3617,16 +3585,19 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s4, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32: @@ -3676,13 +3647,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -3727,17 +3701,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f16_sign_v2f32(<2 x half> inreg %mag, <2 x float> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32: @@ -3789,17 +3762,16 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f16_sign_v2f32(<2 x half> define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s2, v2, v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_bfi_b32 v0, s2, v2, v0 +; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: v_bfi_b32 v1, s2, v2, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: @@ -3864,15 +3836,16 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_brev_b32 s6, -2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v0, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_bfi_b32 v1, s6, v1, v2 -; SI-NEXT: v_readfirstlane_b32 s1, v1 -; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_bfi_b32 v0, s4, v2, v0 +; SI-NEXT: v_mov_b32_e32 v2, s3 +; SI-NEXT: v_bfi_b32 v1, s4, v2, v1 +; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: v_readfirstlane_b32 s3, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: @@ -3938,21 +3911,20 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x doubl define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 +; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -4007,97 +3979,96 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s1, 8 -; SI-NEXT: s_and_b32 s6, s4, 0xffe -; SI-NEXT: s_and_b32 s4, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s4, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: s_bfe_u32 s5, s1, 0xb0014 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: s_lshr_b32 s5, s1, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffe +; SI-NEXT: s_and_b32 s6, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s6, s0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s5 -; SI-NEXT: v_med3_i32 v2, s6, 0, 13 -; SI-NEXT: s_or_b32 s4, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_lshr_b32 s7, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s7, s6 -; SI-NEXT: s_cmp_lg_u32 s6, s4 -; SI-NEXT: s_cselect_b32 s4, 1, 0 -; SI-NEXT: s_addk_i32 s5, 0xfc10 -; SI-NEXT: s_lshl_b32 s6, s5, 12 -; SI-NEXT: s_or_b32 s4, s7, s4 -; SI-NEXT: s_or_b32 s6, s0, s6 -; SI-NEXT: s_cmp_lt_i32 s5, 1 -; SI-NEXT: s_cselect_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s4, 7 -; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; SI-NEXT: s_bfe_u32 s6, s1, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s0, v1 +; SI-NEXT: s_sub_i32 s7, 0x3f1, s6 +; SI-NEXT: s_or_b32 s0, s5, s0 +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: s_or_b32 s5, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_lshr_b32 s8, s5, s7 +; SI-NEXT: s_lshl_b32 s7, s8, s7 +; SI-NEXT: s_cmp_lg_u32 s7, s5 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s6, 12 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s7, s0, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s5, 7 +; SI-NEXT: s_cmp_gt_i32 s7, 5 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 ; SI-NEXT: s_cselect_b32 s7, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_lshr_b32 s4, s4, 2 -; SI-NEXT: s_add_i32 s4, s4, s6 -; SI-NEXT: s_cmp_lt_i32 s5, 31 -; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_lshr_b32 s5, s5, 2 +; SI-NEXT: s_add_i32 s5, s5, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_movk_i32 s6, 0x7e00 -; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s4 +; SI-NEXT: s_movk_i32 s7, 0x7e00 +; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s5 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s4, s1, s0 +; SI-NEXT: s_or_b32 s5, s1, s0 ; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s5, s0, 0xffe +; SI-NEXT: s_and_b32 s6, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_readfirstlane_b32 s0, v1 ; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; SI-NEXT: s_or_b32 s0, s5, s0 -; SI-NEXT: s_sub_i32 s5, 0x3f1, s2 -; SI-NEXT: v_med3_i32 v2, s5, 0, 13 +; SI-NEXT: s_or_b32 s0, s6, s0 +; SI-NEXT: s_sub_i32 s6, 0x3f1, s2 +; SI-NEXT: v_med3_i32 v1, s6, 0, 13 ; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: s_lshr_b32 s7, s1, s5 -; SI-NEXT: s_lshl_b32 s5, s7, s5 -; SI-NEXT: s_cmp_lg_u32 s5, s1 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_lshr_b32 s8, s1, s6 +; SI-NEXT: s_lshl_b32 s6, s8, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s1 ; SI-NEXT: s_cselect_b32 s1, 1, 0 ; SI-NEXT: s_addk_i32 s2, 0xfc10 -; SI-NEXT: s_lshl_b32 s5, s2, 12 -; SI-NEXT: s_or_b32 s1, s7, s1 -; SI-NEXT: s_or_b32 s5, s0, s5 +; SI-NEXT: s_lshl_b32 s6, s2, 12 +; SI-NEXT: s_or_b32 s1, s8, s1 +; SI-NEXT: s_or_b32 s6, s0, s6 ; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s5 -; SI-NEXT: s_and_b32 s5, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s5, 5 -; SI-NEXT: s_cselect_b32 s7, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 -; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_cselect_b32 s1, s1, s6 +; SI-NEXT: s_and_b32 s6, s1, 7 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s5 +; SI-NEXT: s_add_i32 s1, s1, s6 ; SI-NEXT: s_cmp_lt_i32 s2, 31 ; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 +; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 ; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f ; SI-NEXT: s_cselect_b32 s0, s0, s1 ; SI-NEXT: s_lshr_b32 s1, s3, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v2, v0 +; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -4396,21 +4367,20 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg %mag, <2 x float> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32: ; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v2, v3 +; SI-NEXT: v_bfi_b32 v1, s0, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfi_b32 v0, s0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -4465,19 +4435,18 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -4522,16 +4491,14 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f16_sign_v3f32(<3 x half> %mag, < ; SI-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32: @@ -4593,16 +4560,14 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3f16(<3 x float> %mag, ; SI-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: @@ -4664,10 +4629,14 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag ; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v8 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v7 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: @@ -4729,22 +4698,25 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, < ; SI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v1, s4, v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16: @@ -4803,138 +4775,141 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_and_b32_e32 v10, 0x1ff, v5 -; SI-NEXT: v_or_b32_e32 v4, v10, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_and_b32_e32 v9, 0xffe, v9 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v10, v5, 20, 11 +; SI-NEXT: v_and_b32_e32 v10, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffe, v7 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v10, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 ; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 -; SI-NEXT: v_or_b32_e32 v9, 0x1000, v4 +; SI-NEXT: v_or_b32_e32 v7, 0x1000, v2 ; SI-NEXT: v_med3_i32 v11, v11, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v7 ; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v7 ; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v11, v4, v11 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_or_b32_e32 v11, v2, v11 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; SI-NEXT: v_and_b32_e32 v11, 7, v9 +; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc +; SI-NEXT: v_and_b32_e32 v11, 7, v7 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 ; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshrrev_b32_e32 v9, 2, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc ; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 -; SI-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v9, v3, 20, 11 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v9 -; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, s5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v9 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v10, v2, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v9 -; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v9 -; SI-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_movk_i32 s6, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 +; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 ; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 +; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v5 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v9, v9, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v3 +; SI-NEXT: v_med3_i32 v10, v10, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v5 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v9, v0, v9 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; SI-NEXT: v_and_b32_e32 v9, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v10, v0, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; SI-NEXT: v_and_b32_e32 v10, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 ; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v5 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v4, v5, 20, 11 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v4 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; SI-NEXT: v_med3_i32 v7, v7, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v7, v1, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v8 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16: @@ -5431,21 +5406,24 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32: @@ -5504,16 +5482,19 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v8 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: @@ -5584,19 +5565,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f16_sign_v4f32(<4 x half> %mag, < ; SI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v7, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v6, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: @@ -5668,19 +5647,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, ; SI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: @@ -5753,11 +5730,17 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag ; SI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v8 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v9 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v10 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v11 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v9 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v11 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16: @@ -5830,27 +5813,33 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, < ; SI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: @@ -5916,25 +5905,17 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0x1ff, v7 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_and_b32_e32 v13, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v13, v7, 20, 11 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v13, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 ; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13 -; SI-NEXT: v_or_b32_e32 v12, 0x1000, v6 +; SI-NEXT: v_or_b32_e32 v12, 0x1000, v2 ; SI-NEXT: v_med3_i32 v14, v14, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12 ; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15 @@ -5944,7 +5925,7 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13 ; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v14, v6, v14 +; SI-NEXT: v_or_b32_e32 v14, v2, v14 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13 ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; SI-NEXT: v_and_b32_e32 v14, 7, v12 @@ -5959,137 +5940,151 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v13 ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; SI-NEXT: v_mov_b32_e32 v15, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13 -; SI-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffe, v7 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v12, v5, 20, 11 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v12, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12 -; SI-NEXT: v_or_b32_e32 v7, 0x1000, v4 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v13, v13, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v3 ; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 -; SI-NEXT: v_or_b32_e32 v13, v4, v13 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12 -; SI-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc -; SI-NEXT: v_and_b32_e32 v13, 7, v7 +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; SI-NEXT: v_and_b32_e32 v13, 7, v3 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 ; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 ; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-NEXT: v_or_b32_e32 v13, v13, v16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12 -; SI-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 -; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v7 -; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v6, v7, 20, 11 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v6 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 ; SI-NEXT: v_med3_i32 v12, v12, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v3 ; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v7 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v12, v2, v12 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; SI-NEXT: v_and_b32_e32 v12, 7, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v6 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v12, v1, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; SI-NEXT: v_and_b32_e32 v12, 7, v3 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 ; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v5 ; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v5 -; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; SI-NEXT: v_or_b32_e32 v4, 0x1000, v3 ; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v5 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 -; SI-NEXT: v_or_b32_e32 v7, v0, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v7, v3, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v4 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 ; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; SI-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v14, v15, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v11 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v10 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v9 -; SI-NEXT: v_bfi_b32 v3, s4, v6, v8 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v11 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v10 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v9 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16: @@ -6742,25 +6737,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v7, v5 +; SI-NEXT: v_bfi_b32 v3, s4, v6, v3 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32: @@ -6826,19 +6827,25 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v9 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: @@ -7273,10 +7280,9 @@ define half @v_copysign_f16_0_f64(double %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 @@ -7312,12 +7318,15 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2f16: @@ -7407,12 +7416,16 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2bf32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2bf32: @@ -7502,8 +7515,12 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2bf64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2bf64: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 1334d0ef278d1..0bb3b8c6f3740 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -2136,20 +2136,19 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-LABEL: v_rsq_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_sqrt_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; SI-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2158,21 +2157,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, 1.0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 -; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 -; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v4, v4 -; SI-NEXT: v_mul_f32_e32 v4, v5, v2 -; SI-NEXT: v_fma_f32 v6, -v3, v4, v5 -; SI-NEXT: v_fma_f32 v4, v6, v2, v4 -; SI-NEXT: v_fma_f32 v3, -v3, v4, v5 +; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 +; SI-NEXT: v_fma_f32 v2, v2, v5, v5 +; SI-NEXT: v_mul_f32_e32 v5, v3, v2 +; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, v6, v2, v5 +; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v4 -; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rsq_v2f16: @@ -2341,20 +2344,19 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-LABEL: v_neg_rsq_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_sqrt_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_sqrt_f32_e32 v0, v0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; SI-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -2363,21 +2365,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, -1.0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, -1.0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 -; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v1, -1.0 -; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v4, v4 -; SI-NEXT: v_mul_f32_e32 v4, v5, v2 -; SI-NEXT: v_fma_f32 v6, -v3, v4, v5 -; SI-NEXT: v_fma_f32 v4, v6, v2, v4 -; SI-NEXT: v_fma_f32 v3, -v3, v4, v5 +; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 +; SI-NEXT: v_fma_f32 v2, v2, v5, v5 +; SI-NEXT: v_mul_f32_e32 v5, v3, v2 +; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, v6, v2, v5 +; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v4 -; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_neg_rsq_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 0bc1fca0409c3..e32842f8d6f57 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -16420,44 +16420,49 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -16609,44 +16614,51 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16812,44 +16824,51 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16994,14 +17013,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start @@ -17179,14 +17201,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start @@ -17381,14 +17406,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start @@ -17572,44 +17600,51 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17763,14 +17798,17 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start @@ -17949,44 +17987,49 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18130,14 +18173,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start @@ -18315,44 +18361,49 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18496,14 +18547,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start @@ -18824,40 +18878,44 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB68_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result @@ -19148,40 +19206,46 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB69_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19487,40 +19551,46 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19802,15 +19872,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -20120,15 +20192,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -20458,15 +20532,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -20784,40 +20860,46 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21108,15 +21190,17 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -21429,40 +21513,44 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %result @@ -21743,15 +21831,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -22063,40 +22153,44 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB78_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %result @@ -22377,15 +22471,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 6831485790abc..2b15147365777 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -14348,44 +14348,49 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14585,44 +14590,51 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14838,44 +14850,51 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15067,14 +15086,17 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -15299,14 +15321,17 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start @@ -15551,14 +15576,17 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15790,44 +15818,51 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16028,14 +16063,17 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start @@ -16480,40 +16518,44 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result @@ -16927,40 +16969,46 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17391,40 +17439,46 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17824,15 +17878,17 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18260,15 +18316,17 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18719,15 +18777,17 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19169,40 +19229,46 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19612,15 +19678,17 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 4c659f9ca9174..ad7ee22fdb76e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -14348,44 +14348,49 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14585,44 +14590,51 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14838,44 +14850,51 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15067,14 +15086,17 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -15299,14 +15321,17 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start @@ -15551,14 +15576,17 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15790,44 +15818,51 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16028,14 +16063,17 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start @@ -16480,40 +16518,44 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result @@ -16927,40 +16969,46 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17391,40 +17439,46 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17824,15 +17878,17 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18260,15 +18316,17 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18719,15 +18777,17 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19169,40 +19229,46 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19612,15 +19678,17 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9166ad3043472..a278be61104cc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -13898,44 +13898,49 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -14118,44 +14123,51 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14354,44 +14366,51 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14564,14 +14583,17 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start @@ -14777,14 +14799,17 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start @@ -15010,14 +15035,17 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start @@ -15232,44 +15260,51 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst @@ -15451,14 +15486,17 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -15903,40 +15941,44 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result @@ -16350,40 +16392,46 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -16814,40 +16862,46 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -17247,15 +17301,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -17683,15 +17739,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18142,15 +18200,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18592,40 +18652,46 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst @@ -19035,15 +19101,17 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 2465c1eef6b10..e9a6854226e60 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1122,39 +1122,41 @@ define <2 x half> @v_max3_v2f16_maximumnum_maximumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: @@ -1239,53 +1241,51 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_max3_f32 v0, v0, v3, v6 -; GFX6-NEXT: v_max3_f32 v1, v1, v4, v7 -; GFX6-NEXT: v_max3_f32 v2, v2, v5, v8 +; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 +; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max3_f32 v0, v0, v3, v6 -; GFX7-NEXT: v_max3_f32 v1, v1, v4, v7 -; GFX7-NEXT: v_max3_f32 v2, v2, v5, v8 +; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 +; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: @@ -1401,67 +1401,71 @@ define <4 x half> @v_max3_v4f16_maximumnum_maximumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_max3_f32 v0, v0, v4, v8 -; GFX6-NEXT: v_max3_f32 v1, v1, v5, v9 -; GFX6-NEXT: v_max3_f32 v2, v2, v6, v10 -; GFX6-NEXT: v_max3_f32 v3, v3, v7, v11 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 +; GFX6-NEXT: v_max3_f32 v9, v11, v10, v9 +; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max3_f32 v0, v0, v4, v8 -; GFX7-NEXT: v_max3_f32 v1, v1, v5, v9 -; GFX7-NEXT: v_max3_f32 v2, v2, v6, v10 -; GFX7-NEXT: v_max3_f32 v3, v3, v7, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 +; GFX7-NEXT: v_max3_f32 v9, v11, v10, v9 +; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: @@ -2042,51 +2046,51 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX6-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX6-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index ab9c7f16d54ac..38ab4c2712a2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -975,26 +975,26 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-LABEL: no_fmax3_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max3_f32 v0, v4, v0, v6 -; SI-NEXT: v_max3_f32 v1, v5, v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, v7, v6 +; SI-NEXT: v_max3_f32 v0, v2, v0, v3 +; SI-NEXT: v_max3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: no_fmax3_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index bd28f72bb8913..dd2e9896cf882 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -119,16 +119,18 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: @@ -176,16 +178,18 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmax_legacy_ugt_v2f16_fast: @@ -231,21 +235,22 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_max_legacy_f32_e32 v0, v3, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v4, v1 -; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: @@ -301,21 +306,22 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v3 -; SI-NEXT: v_max_f32_e32 v1, v1, v4 -; SI-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v2, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmax_legacy_ugt_v3f16_fast: @@ -377,26 +383,30 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_legacy_f32_e32 v2, v7, v6 +; SI-NEXT: v_max_legacy_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: @@ -461,26 +471,30 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NEXT: v_max_f32_e32 v2, v2, v6 -; SI-NEXT: v_max_f32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v2, v7, v6 +; SI-NEXT: v_max_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmax_legacy_ugt_v4f16_fast: @@ -569,46 +583,54 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_max_legacy_f32_e32 v4, v15, v14 +; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v12 +; SI-NEXT: v_max_legacy_f32_e32 v6, v11, v10 +; SI-NEXT: v_max_legacy_f32_e32 v7, v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_max_legacy_f32_e32 v0, v8, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v9, v1 -; SI-NEXT: v_max_legacy_f32_e32 v2, v10, v2 -; SI-NEXT: v_max_legacy_f32_e32 v3, v11, v3 -; SI-NEXT: v_max_legacy_f32_e32 v4, v12, v4 -; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v5 -; SI-NEXT: v_max_legacy_f32_e32 v6, v14, v6 -; SI-NEXT: v_max_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: @@ -701,46 +723,54 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_max_f32_e32 v3, v3, v7 +; SI-NEXT: v_max_f32_e32 v2, v2, v6 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NEXT: v_max_f32_e32 v4, v15, v14 +; SI-NEXT: v_max_f32_e32 v5, v13, v12 +; SI-NEXT: v_max_f32_e32 v6, v11, v10 +; SI-NEXT: v_max_f32_e32 v7, v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v8 -; SI-NEXT: v_max_f32_e32 v1, v1, v9 -; SI-NEXT: v_max_f32_e32 v2, v2, v10 -; SI-NEXT: v_max_f32_e32 v3, v3, v11 -; SI-NEXT: v_max_f32_e32 v4, v4, v12 -; SI-NEXT: v_max_f32_e32 v5, v5, v13 -; SI-NEXT: v_max_f32_e32 v6, v6, v14 -; SI-NEXT: v_max_f32_e32 v7, v7, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmax_legacy_ugt_v8f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll index 3e513de22caf3..65ced4f658692 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll @@ -89,18 +89,18 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat ; SI-LABEL: v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_max_f32_e32 v1, 2.0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 29163c111fc5e..d1b1a96fdeffc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8776,29 +8776,35 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum: @@ -8986,46 +8992,52 @@ define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v4, 2.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_min_f32_e32 v3, 4.0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v4, 4.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; SI-GISEL-NEXT: v_min_f32_e32 v3, 4.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index 6b99b06e155fb..6c78f55d2da86 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1122,39 +1122,41 @@ define <2 x half> @v_min3_v2f16_minimumnum_minimumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: @@ -1239,53 +1241,51 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_min3_f32 v0, v0, v3, v6 -; GFX6-NEXT: v_min3_f32 v1, v1, v4, v7 -; GFX6-NEXT: v_min3_f32 v2, v2, v5, v8 +; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 +; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min3_f32 v0, v0, v3, v6 -; GFX7-NEXT: v_min3_f32 v1, v1, v4, v7 -; GFX7-NEXT: v_min3_f32 v2, v2, v5, v8 +; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 +; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: @@ -1401,67 +1401,71 @@ define <4 x half> @v_min3_v4f16_minimumnum_minimumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_min3_f32 v0, v0, v4, v8 -; GFX6-NEXT: v_min3_f32 v1, v1, v5, v9 -; GFX6-NEXT: v_min3_f32 v2, v2, v6, v10 -; GFX6-NEXT: v_min3_f32 v3, v3, v7, v11 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 +; GFX6-NEXT: v_min3_f32 v9, v11, v10, v9 +; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min3_f32 v0, v0, v4, v8 -; GFX7-NEXT: v_min3_f32 v1, v1, v5, v9 -; GFX7-NEXT: v_min3_f32 v2, v2, v6, v10 -; GFX7-NEXT: v_min3_f32 v3, v3, v7, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 +; GFX7-NEXT: v_min3_f32 v9, v11, v10, v9 +; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: @@ -2044,51 +2048,51 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX6-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX6-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 7c2aeeb90bd9e..fee2fad933158 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -975,26 +975,26 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-LABEL: no_fmin3_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min3_f32 v0, v4, v0, v6 -; SI-NEXT: v_min3_f32 v1, v5, v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, v7, v6 +; SI-NEXT: v_min3_f32 v0, v2, v0, v3 +; SI-NEXT: v_min3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: no_fmin3_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 40c2ec0a39f51..9e5a28d6c5041 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -120,16 +120,18 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: @@ -177,16 +179,18 @@ define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmin_legacy_ule_v2f16_fast: @@ -232,21 +236,22 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_min_legacy_f32_e32 v0, v3, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v4, v1 -; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: @@ -302,21 +307,22 @@ define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v3 -; SI-NEXT: v_min_f32_e32 v1, v1, v4 -; SI-NEXT: v_min_f32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v2, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmin_legacy_ule_v3f16_fast: @@ -378,26 +384,30 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_legacy_f32_e32 v2, v7, v6 +; SI-NEXT: v_min_legacy_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: @@ -462,26 +472,30 @@ define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NEXT: v_min_f32_e32 v2, v2, v6 -; SI-NEXT: v_min_f32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v2, v7, v6 +; SI-NEXT: v_min_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmin_legacy_ule_v4f16_fast: @@ -570,46 +584,54 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_min_legacy_f32_e32 v4, v15, v14 +; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v12 +; SI-NEXT: v_min_legacy_f32_e32 v6, v11, v10 +; SI-NEXT: v_min_legacy_f32_e32 v7, v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_min_legacy_f32_e32 v0, v8, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v9, v1 -; SI-NEXT: v_min_legacy_f32_e32 v2, v10, v2 -; SI-NEXT: v_min_legacy_f32_e32 v3, v11, v3 -; SI-NEXT: v_min_legacy_f32_e32 v4, v12, v4 -; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v5 -; SI-NEXT: v_min_legacy_f32_e32 v6, v14, v6 -; SI-NEXT: v_min_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: @@ -702,46 +724,54 @@ define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_min_f32_e32 v3, v3, v7 +; SI-NEXT: v_min_f32_e32 v2, v2, v6 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_min_f32_e32 v4, v15, v14 +; SI-NEXT: v_min_f32_e32 v5, v13, v12 +; SI-NEXT: v_min_f32_e32 v6, v11, v10 +; SI-NEXT: v_min_f32_e32 v7, v9, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v8 -; SI-NEXT: v_min_f32_e32 v1, v1, v9 -; SI-NEXT: v_min_f32_e32 v2, v2, v10 -; SI-NEXT: v_min_f32_e32 v3, v3, v11 -; SI-NEXT: v_min_f32_e32 v4, v4, v12 -; SI-NEXT: v_min_f32_e32 v5, v5, v13 -; SI-NEXT: v_min_f32_e32 v6, v6, v14 -; SI-NEXT: v_min_f32_e32 v7, v7, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: test_fmin_legacy_ule_v8f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 688e152e73f40..b3f6de638a67d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -1792,10 +1792,14 @@ define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: @@ -2426,10 +2430,14 @@ define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: @@ -3288,71 +3296,75 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c ; SI-SAFE-LABEL: v_fneg_fmad_v4f32: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v11, v3, v7 -; SI-SAFE-NEXT: v_mac_f32_e32 v10, v2, v6 -; SI-SAFE-NEXT: v_mac_f32_e32 v9, v1, v5 -; SI-SAFE-NEXT: v_mac_f32_e32 v8, v0, v4 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v8 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v9 -; SI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v10 -; SI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v11 +; SI-SAFE-NEXT: v_mac_f32_e32 v9, v11, v10 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-SAFE-NEXT: v_mac_f32_e32 v5, v1, v3 +; SI-SAFE-NEXT: v_mac_f32_e32 v6, v8, v7 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-SAFE-NEXT: v_mac_f32_e32 v4, v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-SAFE-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_v4f32: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NSZ-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v5 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v6, -v6 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v7, -v7 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v4, -v8 -; SI-NSZ-NEXT: v_mad_f32 v1, v1, v5, -v9 -; SI-NSZ-NEXT: v_mad_f32 v2, v2, v6, -v10 -; SI-NSZ-NEXT: v_mad_f32 v3, v3, v7, -v11 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NSZ-NEXT: v_mad_f32 v6, v7, v11, -v6 +; SI-NSZ-NEXT: v_mad_f32 v8, v9, v10, -v8 +; SI-NSZ-NEXT: v_mad_f32 v0, v0, v2, -v4 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NSZ-NEXT: v_mad_f32 v1, v1, v3, -v5 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fmad_v4f32: @@ -5766,31 +5778,30 @@ define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %ar ; SI-LABEL: fneg_fma_fneg_dagcombine_loop: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_brev_b32 s4, 1 -; SI-NEXT: v_fma_f32 v3, v3, v7, s4 -; SI-NEXT: v_fma_f32 v2, v2, v6, s4 -; SI-NEXT: v_sub_f32_e32 v1, v3, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 -; SI-NEXT: v_mul_f32_e32 v0, v0, v4 -; SI-NEXT: v_mul_f32_e32 v1, v1, v5 +; SI-NEXT: v_fma_f32 v5, v5, v7, s4 +; SI-NEXT: v_sub_f32_e32 v4, v5, v4 +; SI-NEXT: v_fma_f32 v1, v1, v2, s4 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-NEXT: v_mul_f32_e32 v1, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fneg_fma_fneg_dagcombine_loop: @@ -6055,25 +6066,20 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-LABEL: fadd_select_fneg_fneg_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 6645bc858d392..bdea710725ace 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7808,25 +7808,20 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-LABEL: fadd_select_fneg_fneg_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 022edc4792d25..58adbd4d0d250 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -3815,25 +3815,20 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-LABEL: fadd_select_fneg_fneg_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_v2f16: @@ -4242,32 +4237,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar } define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1) { -; SI-LABEL: v_fneg_select_infloop_regression_v2f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 1, v2 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_select_infloop_regression_v2f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 1, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_select_infloop_regression_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg %i2 = fneg <2 x half> %i %i3 = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %i2 @@ -4453,6 +4431,7 @@ define float @v_fmul_0_fsub_0_safe_infloop_regression(float %arg) { ; SI-NSZ-NEXT: s_brev_b32 s4, 1 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, s4, 0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; ; FIXME: utils/update_llc_test_checks.py will generate redundant VI ; labels, remove them, they will cause test failure. bb: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index db08cb132a3d7..e5c34f695f9a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -214,18 +214,16 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX7-LABEL: fneg_xor_select_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80000000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fneg_xor_select_v2i16: @@ -722,17 +720,12 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX7-LABEL: select_fneg_select_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 @@ -742,18 +735,22 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_v2f16: @@ -840,26 +837,27 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX7-LABEL: select_fneg_xor_select_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX7-NEXT: v_xor_b32_e32 v7, 0xffff8000, v7 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GFX7-NEXT: v_xor_b32_e32 v5, 0x8000, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX7-NEXT: v_xor_b32_e32 v4, 0x8000, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_xor_b32_e32 v4, 0x8000, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_xor_b32_e32 v5, 0x8000, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_xor_select_v2i16: @@ -1199,23 +1197,11 @@ define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) { } define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) { -; GFX7-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64: ; GFX11: ; %bb.0: @@ -1228,25 +1214,11 @@ define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) { } define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) { -; GFX7-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 02235151a83e1..c277f3b546c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -673,28 +673,11 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { } define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { -; SI-LABEL: v_fneg_v2i16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_v2i16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fneg_v2i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg = xor <2 x i16> %in, ret <2 x i16> %fneg } @@ -759,10 +742,14 @@ define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_v2i16_fp_use: diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll index ad3f3433c74b3..fd7816e7df1d9 100644 --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -224,20 +224,22 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16: @@ -366,24 +368,23 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_lhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_log_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_log_f32_e32 v4, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v3 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v4 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: @@ -513,24 +514,23 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: @@ -660,28 +660,24 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v3, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index b14935c57152b..a2bd98d3d7b27 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1670,31 +1670,37 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX6-LABEL: basic_fract_v2f16_nonan: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_floor_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_floor_f32_e32 v3, v1 +; GFX6-NEXT: v_floor_f32_e32 v2, v0 ; GFX6-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX6-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: basic_fract_v2f16_nonan: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_floor_f32_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_floor_f32_e32 v3, v1 +; GFX7-NEXT: v_floor_f32_e32 v2, v0 ; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX7-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: basic_fract_v2f16_nonan: @@ -2649,72 +2655,78 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX6-LABEL: safe_math_fract_v2f16: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_floor_f32_e32 v6, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_floor_f32_e32 v8, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v4, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v9, v8 -; GFX6-NEXT: v_sub_f32_e32 v8, v5, v8 -; GFX6-NEXT: v_min_f32_e32 v6, 0x3f7fe000, v6 +; GFX6-NEXT: v_floor_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_floor_f32_e32 v7, v4 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_sub_f32_e32 v7, v4, v7 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX6-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v8 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX6-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_or_b32_e32 v7, v9, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: buffer_store_dword v7, v[2:3], s[4:7], 0 addr64 +; GFX6-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_v2f16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7-NEXT: v_floor_f32_e32 v6, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_floor_f32_e32 v8, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v4, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v8 -; GFX7-NEXT: v_sub_f32_e32 v8, v5, v8 -; GFX7-NEXT: v_min_f32_e32 v6, 0x3f7fe000, v6 +; GFX7-NEXT: v_floor_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_floor_f32_e32 v7, v4 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v7 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX7-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX7-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX7-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX7-NEXT: buffer_store_dword v7, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 7afb2cf317869..be49838f86417 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1618,16 +1618,17 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2 ; SI-LABEL: v_fshr_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; SI-NEXT: v_or_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i16: @@ -1716,20 +1717,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; SI-LABEL: v_fshr_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; SI-NEXT: v_or_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 -; SI-NEXT: v_or_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; SI-NEXT: v_or_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v6, v8, v7 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v3i16: @@ -1919,26 +1921,28 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; SI-LABEL: v_fshr_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 ; SI-NEXT: v_or_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; SI-NEXT: v_or_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5 -; SI-NEXT: v_or_b32_e32 v4, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 -; SI-NEXT: v_or_b32_e32 v5, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_alignbit_b32 v8, v8, v10, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; SI-NEXT: v_or_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v6, v9, v7 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 5babe9fb3d851..1117e7f74f11c 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -974,26 +974,14 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 { } define void @void_func_v2i16(<2 x i16> %arg0) #0 { -; CI-LABEL: void_func_v2i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v2i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v2i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v2i16: ; GFX11: ; %bb.0: @@ -1775,28 +1763,15 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { } define void @void_func_v3i16(<3 x i16> %arg0) #0 { -; CI-LABEL: void_func_v3i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v3i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v3i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v3i16: ; GFX11: ; %bb.0: @@ -1812,29 +1787,14 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 { } define void @void_func_v4i16(<4 x i16> %arg0) #0 { -; CI-LABEL: void_func_v4i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_or_b32_e32 v1, v0, v1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v4i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v4i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v4i16: ; GFX11: ; %bb.0: @@ -1848,31 +1808,15 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 { } define void @void_func_v5i16(<5 x i16> %arg0) #0 { -; CI-LABEL: void_func_v5i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_or_b32_e32 v1, v0, v1 -; CI-NEXT: buffer_store_short v4, off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v5i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v2, off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v5i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v5i16: ; GFX11: ; %bb.0: @@ -1888,35 +1832,14 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 { } define void @void_func_v8i16(<8 x i16> %arg0) #0 { -; CI-LABEL: void_func_v8i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: v_or_b32_e32 v5, v4, v5 -; CI-NEXT: v_or_b32_e32 v4, v2, v3 -; CI-NEXT: v_or_b32_e32 v3, v0, v1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v8i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v8i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v8i16: ; GFX11: ; %bb.0: @@ -1930,49 +1853,15 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 { } define void @void_func_v16i16(<16 x i16> %arg0) #0 { -; CI-LABEL: void_func_v16i16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v5, v4, v5 -; CI-NEXT: v_or_b32_e32 v4, v2, v3 -; CI-NEXT: v_or_b32_e32 v3, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; CI-NEXT: v_or_b32_e32 v14, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; CI-NEXT: v_or_b32_e32 v13, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; CI-NEXT: v_or_b32_e32 v12, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; CI-NEXT: v_or_b32_e32 v11, v1, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v16i16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v16i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v16i16: ; GFX11: ; %bb.0: @@ -2311,27 +2200,14 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 { } define void @void_func_v2f16(<2 x half> %arg0) #0 { -; CI-LABEL: void_func_v2f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v2f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v2f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v2f16: ; GFX11: ; %bb.0: @@ -2346,30 +2222,15 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 { ; FIXME: Different abi if f16 legal define void @void_func_v3f16(<3 x half> %arg0) #0 { -; CI-LABEL: void_func_v3f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v3f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v3f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v3f16: ; GFX11: ; %bb.0: @@ -2385,31 +2246,14 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 { } define void @void_func_v4f16(<4 x half> %arg0) #0 { -; CI-LABEL: void_func_v4f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v2 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v4f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v4f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v4f16: ; GFX11: ; %bb.0: @@ -2423,39 +2267,14 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 { } define void @void_func_v8f16(<8 x half> %arg0) #0 { -; CI-LABEL: void_func_v8f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; CI-NEXT: v_or_b32_e32 v5, v6, v5 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 -; CI-NEXT: v_or_b32_e32 v3, v2, v3 -; CI-NEXT: v_or_b32_e32 v2, v0, v1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v8f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v8f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v8f16: ; GFX11: ; %bb.0: @@ -2469,57 +2288,15 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 { } define void @void_func_v16f16(<16 x half> %arg0) #0 { -; CI-LABEL: void_func_v16f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v5, v6, v5 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; CI-NEXT: v_or_b32_e32 v3, v2, v3 -; CI-NEXT: v_or_b32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v13, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; CI-NEXT: v_or_b32_e32 v12, v7, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v11, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; CI-NEXT: v_or_b32_e32 v10, v7, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v16f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v16f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v16f16: ; GFX11: ; %bb.0: @@ -3172,68 +2949,45 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v33 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32 -; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33 -; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34 -; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35 -; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 -; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v34, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v5, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v3, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v9, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v8, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v32, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -4712,27 +4466,14 @@ define void @void_func_bf16(bfloat %arg0) #0 { } define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 { -; CI-LABEL: void_func_v2bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v2bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v2bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v2bf16: ; GFX11: ; %bb.0: @@ -4746,31 +4487,15 @@ define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 { } define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 { -; CI-LABEL: void_func_v3bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v3bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v3bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v3bf16: ; GFX11: ; %bb.0: @@ -4786,31 +4511,14 @@ define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 { } define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 { -; CI-LABEL: void_func_v4bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v4bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v4bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v4bf16: ; GFX11: ; %bb.0: @@ -4824,39 +4532,14 @@ define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 { } define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 { -; CI-LABEL: void_func_v8bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v8bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v8bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v8bf16: ; GFX11: ; %bb.0: @@ -4870,57 +4553,15 @@ define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 { } define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 { -; CI-LABEL: void_func_v16bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; CI-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; CI-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; CI-NEXT: v_alignbit_b32 v12, v0, v1, 16 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; CI-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_v16bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v16bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v16bf16: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 0084d936ec03b..c923431bb17c1 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -858,24 +858,14 @@ define <16 x i64> @v16i64_func_void() #0 { } define <2 x i16> @v2i16_func_void() #0 { -; CI-LABEL: v2i16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v2i16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v2i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v2i16_func_void: ; GFX11: ; %bb.0: @@ -895,11 +885,9 @@ define <3 x i16> @v3i16_func_void() #0 { ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 -; CI-NEXT: v_mov_b32_e32 v0, v3 -; CI-NEXT: v_mov_b32_e32 v2, v4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v3i16_func_void: @@ -924,27 +912,14 @@ define <3 x i16> @v3i16_func_void() #0 { } define <4 x i16> @v4i16_func_void() #0 { -; CI-LABEL: v4i16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_mov_b32_e32 v1, v4 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v4i16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v4i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v4i16_func_void: ; GFX11: ; %bb.0: @@ -959,29 +934,14 @@ define <4 x i16> @v4i16_func_void() #0 { } define <4 x half> @v4f16_func_void() #0 { -; CI-LABEL: v4f16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v4f16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v4f16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v4f16_func_void: ; GFX11: ; %bb.0: @@ -1006,12 +966,11 @@ define <5 x i16> @v5i16_func_void() #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_sshort v4, off, s[4:7], 0 offset:8 +; CI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_mov_b32_e32 v1, v5 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1042,35 +1001,16 @@ define <5 x i16> @v5i16_func_void() #0 { } define <8 x i16> @v8i16_func_void() #0 { -; CI-LABEL: v8i16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; CI-NEXT: v_mov_b32_e32 v0, v8 -; CI-NEXT: v_mov_b32_e32 v2, v9 -; CI-NEXT: v_mov_b32_e32 v4, v10 -; CI-NEXT: v_mov_b32_e32 v6, v11 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v8i16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v8i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v8i16_func_void: ; GFX11: ; %bb.0: @@ -1088,46 +1028,17 @@ define <8 x i16> @v8i16_func_void() #0 { } define <16 x i16> @v16i16_func_void() #0 { -; CI-LABEL: v16i16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[22:25], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:16 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v21 -; CI-NEXT: v_mov_b32_e32 v0, v22 -; CI-NEXT: v_mov_b32_e32 v2, v23 -; CI-NEXT: v_mov_b32_e32 v4, v24 -; CI-NEXT: v_mov_b32_e32 v6, v25 -; CI-NEXT: v_mov_b32_e32 v8, v18 -; CI-NEXT: v_mov_b32_e32 v10, v19 -; CI-NEXT: v_mov_b32_e32 v12, v20 -; CI-NEXT: v_mov_b32_e32 v14, v21 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v16i16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v16i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v16i16_func_void: ; GFX11: ; %bb.0: @@ -2380,25 +2291,14 @@ define bfloat @bf16_func_void() #0 { } define <2 x bfloat> @v2bf16_func_void() #0 { -; CI-LABEL: v2bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v2bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v2bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v2bf16_func_void: ; GFX11: ; %bb.0: @@ -2418,11 +2318,9 @@ define <3 x bfloat> @v3bf16_func_void() #0 { ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v3bf16_func_void: @@ -2447,27 +2345,14 @@ define <3 x bfloat> @v3bf16_func_void() #0 { } define <4 x bfloat> @v4bf16_func_void() #0 { -; CI-LABEL: v4bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v4bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v4bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v4bf16_func_void: ; GFX11: ; %bb.0: @@ -2482,29 +2367,14 @@ define <4 x bfloat> @v4bf16_func_void() #0 { } define <6 x bfloat> @v6bf16_func_void() #0 { -; CI-LABEL: v6bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx3 v[3:5], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v6bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v6bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v6bf16_func_void: ; GFX11: ; %bb.0: @@ -2519,31 +2389,14 @@ define <6 x bfloat> @v6bf16_func_void() #0 { } define <8 x bfloat> @v8bf16_func_void() #0 { -; CI-LABEL: v8bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v8bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v8bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v8bf16_func_void: ; GFX11: ; %bb.0: @@ -2558,43 +2411,18 @@ define <8 x bfloat> @v8bf16_func_void() #0 { } define <16 x bfloat> @v16bf16_func_void() #0 { -; CI-LABEL: v16bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; CI-NEXT: v_mov_b32_e32 v8, v0 -; CI-NEXT: v_mov_b32_e32 v9, v1 -; CI-NEXT: v_mov_b32_e32 v10, v2 -; CI-NEXT: v_mov_b32_e32 v11, v3 -; CI-NEXT: v_mov_b32_e32 v12, v4 -; CI-NEXT: v_mov_b32_e32 v13, v5 -; CI-NEXT: v_mov_b32_e32 v14, v6 -; CI-NEXT: v_mov_b32_e32 v15, v7 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v16bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v4, v0 -; GFX89-NEXT: v_mov_b32_e32 v5, v1 -; GFX89-NEXT: v_mov_b32_e32 v6, v2 -; GFX89-NEXT: v_mov_b32_e32 v7, v3 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v16bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mov_b32_e32 v4, v0 +; GFX789-NEXT: v_mov_b32_e32 v5, v1 +; GFX789-NEXT: v_mov_b32_e32 v6, v2 +; GFX789-NEXT: v_mov_b32_e32 v7, v3 +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v16bf16_func_void: ; GFX11: ; %bb.0: @@ -2611,67 +2439,26 @@ define <16 x bfloat> @v16bf16_func_void() #0 { } define <32 x bfloat> @v32bf16_func_void() #0 { -; CI-LABEL: v32bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; CI-NEXT: v_mov_b32_e32 v8, v0 -; CI-NEXT: v_mov_b32_e32 v9, v1 -; CI-NEXT: v_mov_b32_e32 v10, v2 -; CI-NEXT: v_mov_b32_e32 v11, v3 -; CI-NEXT: v_mov_b32_e32 v16, v0 -; CI-NEXT: v_mov_b32_e32 v17, v1 -; CI-NEXT: v_mov_b32_e32 v18, v2 -; CI-NEXT: v_mov_b32_e32 v19, v3 -; CI-NEXT: v_mov_b32_e32 v24, v0 -; CI-NEXT: v_mov_b32_e32 v25, v1 -; CI-NEXT: v_mov_b32_e32 v26, v2 -; CI-NEXT: v_mov_b32_e32 v27, v3 -; CI-NEXT: v_mov_b32_e32 v12, v4 -; CI-NEXT: v_mov_b32_e32 v20, v4 -; CI-NEXT: v_mov_b32_e32 v28, v4 -; CI-NEXT: v_mov_b32_e32 v13, v5 -; CI-NEXT: v_mov_b32_e32 v21, v5 -; CI-NEXT: v_mov_b32_e32 v29, v5 -; CI-NEXT: v_mov_b32_e32 v14, v6 -; CI-NEXT: v_mov_b32_e32 v22, v6 -; CI-NEXT: v_mov_b32_e32 v30, v6 -; CI-NEXT: v_mov_b32_e32 v15, v7 -; CI-NEXT: v_mov_b32_e32 v23, v7 -; CI-NEXT: v_mov_b32_e32 v31, v7 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: v32bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v4, v0 -; GFX89-NEXT: v_mov_b32_e32 v5, v1 -; GFX89-NEXT: v_mov_b32_e32 v6, v2 -; GFX89-NEXT: v_mov_b32_e32 v7, v3 -; GFX89-NEXT: v_mov_b32_e32 v8, v0 -; GFX89-NEXT: v_mov_b32_e32 v9, v1 -; GFX89-NEXT: v_mov_b32_e32 v10, v2 -; GFX89-NEXT: v_mov_b32_e32 v11, v3 -; GFX89-NEXT: v_mov_b32_e32 v12, v0 -; GFX89-NEXT: v_mov_b32_e32 v13, v1 -; GFX89-NEXT: v_mov_b32_e32 v14, v2 -; GFX89-NEXT: v_mov_b32_e32 v15, v3 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: v32bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mov_b32_e32 v4, v0 +; GFX789-NEXT: v_mov_b32_e32 v5, v1 +; GFX789-NEXT: v_mov_b32_e32 v6, v2 +; GFX789-NEXT: v_mov_b32_e32 v7, v3 +; GFX789-NEXT: v_mov_b32_e32 v8, v0 +; GFX789-NEXT: v_mov_b32_e32 v9, v1 +; GFX789-NEXT: v_mov_b32_e32 v10, v2 +; GFX789-NEXT: v_mov_b32_e32 v11, v3 +; GFX789-NEXT: v_mov_b32_e32 v12, v0 +; GFX789-NEXT: v_mov_b32_e32 v13, v1 +; GFX789-NEXT: v_mov_b32_e32 v14, v2 +; GFX789-NEXT: v_mov_b32_e32 v15, v3 +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v32bf16_func_void: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 5b49e5fa09546..8996a8d9ce4bf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -19546,46 +19546,51 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -19595,47 +19600,52 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB64_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -19790,46 +19800,51 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19839,47 +19854,52 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB65_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -20035,48 +20055,55 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20086,49 +20113,56 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB66_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -20265,14 +20299,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start @@ -20312,14 +20349,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start @@ -20488,14 +20528,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start @@ -20535,14 +20578,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start @@ -20712,18 +20758,21 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start @@ -20763,18 +20812,21 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start @@ -20964,46 +21016,51 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21013,47 +21070,52 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB70_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -21195,14 +21257,17 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start @@ -21242,14 +21307,17 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start @@ -21449,46 +21517,51 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB72_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -21498,47 +21571,52 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB72_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -21700,14 +21778,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start @@ -21747,14 +21828,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start @@ -21939,46 +22023,51 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -21988,47 +22077,52 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB74_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -22164,14 +22258,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start @@ -22211,14 +22308,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start @@ -22417,46 +22517,51 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: @@ -22466,47 +22571,52 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB76_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -22668,14 +22778,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start @@ -22715,14 +22828,17 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start @@ -23064,42 +23180,46 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB78_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -23109,43 +23229,47 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB78_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -23453,42 +23577,46 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB79_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23498,43 +23626,47 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB79_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -23843,44 +23975,50 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB80_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -23890,45 +24028,51 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB80_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -24228,15 +24372,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -24271,15 +24417,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -24606,15 +24754,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -24649,15 +24799,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -24986,6 +25138,8 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -25033,6 +25187,8 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -25382,42 +25538,46 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB84_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -25427,43 +25587,47 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB84_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -25768,15 +25932,17 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -25811,15 +25977,17 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -26153,42 +26321,46 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB86_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -26198,43 +26370,47 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB86_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -26533,15 +26709,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -26576,15 +26754,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -26917,42 +27097,46 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -26962,43 +27146,47 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB88_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -27297,15 +27485,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -27340,15 +27530,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -27681,42 +27873,46 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: @@ -27726,43 +27922,47 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB90_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst @@ -28061,15 +28261,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -28104,15 +28306,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 31ec099d41cfe..279cff3f5d368 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -13747,46 +13747,51 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13796,47 +13801,52 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14040,46 +14050,51 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14089,47 +14104,52 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14334,48 +14354,55 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14385,49 +14412,56 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -14624,14 +14658,17 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -14671,14 +14708,17 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start @@ -14907,14 +14947,17 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start @@ -14954,14 +14997,17 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start @@ -15191,18 +15237,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15242,18 +15291,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15492,46 +15544,51 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15541,47 +15598,52 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -15783,14 +15845,17 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start @@ -15830,14 +15895,17 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start @@ -16290,42 +16358,46 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16335,43 +16407,47 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16789,42 +16865,46 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16834,43 +16914,47 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -17289,44 +17373,50 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17336,45 +17426,51 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -17779,15 +17875,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -17822,15 +17920,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18262,15 +18362,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18305,15 +18407,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18747,6 +18851,8 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 @@ -18794,6 +18900,8 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 @@ -19254,42 +19362,46 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19299,43 +19411,47 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -19746,15 +19862,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19789,15 +19907,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 64d42356e8968..e658cb658de78 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -13747,46 +13747,51 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13796,47 +13801,52 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14040,46 +14050,51 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14089,47 +14104,52 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14334,48 +14354,55 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14385,49 +14412,56 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -14624,14 +14658,17 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -14671,14 +14708,17 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start @@ -14907,14 +14947,17 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start @@ -14954,14 +14997,17 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start @@ -15191,18 +15237,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15242,18 +15291,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start @@ -15492,46 +15544,51 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15541,47 +15598,52 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -15783,14 +15845,17 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start @@ -15830,14 +15895,17 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start @@ -16290,42 +16358,46 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16335,43 +16407,47 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16789,42 +16865,46 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16834,43 +16914,47 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -17289,44 +17373,50 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17336,45 +17426,51 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -17779,15 +17875,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -17822,15 +17920,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18262,15 +18362,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18305,15 +18407,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -18747,6 +18851,8 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 @@ -18794,6 +18900,8 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 @@ -19254,42 +19362,46 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19299,43 +19411,47 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -19746,15 +19862,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -19789,15 +19907,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 940918a5437b3..965c10b2e9ff9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -14236,46 +14236,51 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -14285,47 +14290,52 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -14512,46 +14522,51 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14561,47 +14576,52 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -14789,48 +14809,55 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -14840,49 +14867,56 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 @@ -15060,14 +15094,17 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start @@ -15107,14 +15144,17 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start @@ -15324,14 +15364,17 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start @@ -15371,14 +15414,17 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start @@ -15589,18 +15635,21 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start @@ -15640,18 +15689,21 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start @@ -15873,46 +15925,51 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -15922,47 +15979,52 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 @@ -16145,14 +16207,17 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start @@ -16192,14 +16257,17 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start @@ -16652,42 +16720,46 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -16697,43 +16769,47 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst @@ -17151,42 +17227,46 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17196,43 +17276,47 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -17651,44 +17735,50 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -17698,45 +17788,51 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 @@ -18141,15 +18237,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18184,15 +18282,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18624,15 +18724,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -18667,15 +18769,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -19109,6 +19213,8 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19156,6 +19262,8 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -19616,42 +19724,46 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -19661,43 +19773,47 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 @@ -20108,15 +20224,17 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -20151,15 +20269,17 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll index 86dbb16948ebe..53cfd12a953d3 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll @@ -196,9 +196,9 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i1_to_v2bf16: @@ -474,18 +474,18 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX7-LABEL: v_uitofp_v3i1_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc +; GFX7-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i1_to_v3bf16: @@ -858,21 +858,21 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i1_to_v4bf16: @@ -1520,9 +1520,9 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) { ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i1_to_v2bf16: @@ -1800,18 +1800,18 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX7-LABEL: v_sitofp_v3i1_to_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc +; GFX7-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i1_to_v3bf16: @@ -2187,21 +2187,21 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i1_to_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 69bd0687b71af..d74515f19dc8d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -541,24 +541,25 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v1, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v3, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v2, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 @@ -569,55 +570,63 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v3, v1, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v1, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v2, v5, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v2, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16: @@ -749,103 +758,113 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v4, vcc, 1, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v8 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v7, v8, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v6, v3, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v5, v9, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v1, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v5 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v5, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v3i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v4 ; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v7, v5, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v8, v3, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v4, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v3i16: @@ -1143,140 +1162,150 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, 1, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v11, v4, v7 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v7, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v3, 1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v3, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v10, v5, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v6, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 ; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v1, v3 ; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v6, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, s4, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v5, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v7, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v4, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v4 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v4 +; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v10, v6, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v11, v7, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v11, v7, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v9, v3, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v6, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v6, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v3, 1 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v4, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v5, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -1284,11 +1313,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v5, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 @@ -1296,17 +1325,25 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v6 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v4, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v4i16: @@ -1884,24 +1921,25 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v1, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v3, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v2, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 @@ -1912,55 +1950,63 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v3, v1, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v1, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v2, v5, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v2, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16: @@ -2092,103 +2138,113 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v4, vcc, 1, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v8 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v7, v8, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v6, v3, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v5, v9, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v1, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v5 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v5, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v3i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v4 ; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v7, v5, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v8, v3, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v4, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v3i16: @@ -2486,140 +2542,150 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, 1, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v11, v4, v7 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v7, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v3, 1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v3, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v10, v5, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v6, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 ; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v1, v3 ; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v6, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX67-SDAG-NEXT: v_add_i32_e32 v7, vcc, s4, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v5, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v7, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v4, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v4i16: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v4 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v4 +; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v10, v6, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v11, v7, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v11, v7, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v9, v3, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v6, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v6, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v3, 1 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v4, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v5, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2627,11 +2693,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v5, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 @@ -2639,17 +2705,25 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v6 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v4, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v4i16: @@ -9583,48 +9657,49 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v1, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v0, v4, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v3, v5, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v0 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v3, v5, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v1, v4, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v1, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v3, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v2, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 @@ -9635,80 +9710,84 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v3, v1, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v2, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v0, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v0 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v3, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v1, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v2, v3, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 @@ -9719,11 +9798,15 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: @@ -9923,48 +10006,49 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v1, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v0, v4, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v3, v5, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v0 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v3, v5, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v1, v4, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v1, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v3, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v2, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 @@ -9975,80 +10059,84 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 ; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v3, v1, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v4, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v2, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v0, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v0 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v3, v5, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v1, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v2, v3, 1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 @@ -10059,11 +10147,15 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: @@ -10333,14 +10425,30 @@ entry: } define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { -; GFX67-LABEL: multi_use_mul_mad_i16_var: -; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mad_u32_u24 v0, v4, v1, v2 -; GFX67-NEXT: v_mad_u32_u24 v1, v4, v1, v3 -; GFX67-NEXT: s_setpc_b64 s[30:31] +; GFX67-SDAG-LABEL: multi_use_mul_mad_i16_var: +; GFX67-SDAG: ; %bb.0: ; %entry +; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v1, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, v3 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX67-GISEL-LABEL: multi_use_mul_mad_i16_var: +; GFX67-GISEL: ; %bb.0: ; %entry +; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v1, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_i16_var: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -10742,35 +10850,45 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-SDAG-LABEL: multi_use_mul_mad_v2i16_var: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v1, v3, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v8, v2, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v3, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v2, v6 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v7, v6, v5 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v1, v2 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v2, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v6, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v1, v3 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: multi_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v2, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v9, v3, v5 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v2, v6 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v9, v3, v7 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v1, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v4, v5, v6 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v0, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v4, v5, v7 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var: @@ -10868,43 +10986,48 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-SDAG-LABEL: other_use_mul_mad_v2i16_var: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v6, v5 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v6, v5, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, v2 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX67-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: ds_write_b32 v6, v7 +; GFX67-SDAG-NEXT: ds_write_b32 v3, v7 ; GFX67-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: other_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v1, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v4, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, v4 -; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX67-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v4, v5, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX67-GISEL-NEXT: ds_write_b32 v6, v7 +; GFX67-GISEL-NEXT: ds_write_b32 v3, v6 ; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index f770133e3559f..4c102c1f47dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -616,19 +616,10 @@ main_body: } define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) { -; VERDE-LABEL: buffer_store_v2f16: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: buffer_store_v2f16: -; GFX8: ; %bb.0: ; %main_body -; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: buffer_store_v2f16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: buffer_store_v2f16: ; GFX11: ; %bb.0: ; %main_body @@ -651,23 +642,10 @@ main_body: } define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 { -; VERDE-LABEL: buffer_store_v4f16: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 -; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: buffer_store_v4f16: -; GFX8: ; %bb.0: ; %main_body -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: buffer_store_v4f16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: buffer_store_v4f16: ; GFX11: ; %bb.0: ; %main_body @@ -717,18 +695,10 @@ main_body: } define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) { -; VERDE-LABEL: buffer_store_v2i16: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: buffer_store_v2i16: -; GFX8: ; %bb.0: ; %main_body -; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: buffer_store_v2i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: buffer_store_v2i16: ; GFX11: ; %bb.0: ; %main_body @@ -751,21 +721,10 @@ main_body: } define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { -; VERDE-LABEL: buffer_store_v4i16: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 -; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: buffer_store_v4i16: -; GFX8: ; %bb.0: ; %main_body -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: buffer_store_v4i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: buffer_store_v4i16: ; GFX11: ; %bb.0: ; %main_body diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 4dd258b7bda82..6e887f54de861 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -58,10 +58,8 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; GFX7-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: raw_ptr_buffer_load_v2bf16: @@ -107,12 +105,8 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: @@ -164,16 +158,8 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index 5894073ea47e3..395de3d4e2379 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -46,11 +46,7 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { ; GFX7-LABEL: buffer_store_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: buffer_store_v2bf16: @@ -85,15 +81,7 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { ; GFX7-LABEL: buffer_store_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: buffer_store_v4bf16: @@ -134,23 +122,7 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bfloat> %data, i32 %offset) #0 { ; GFX7-LABEL: buffer_store_v8bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: buffer_store_v8bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index 91c479e599d88..85bd675323627 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -347,11 +347,7 @@ main_body: define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) { ; VERDE-LABEL: buffer_store_v2f16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v2f16: @@ -366,15 +362,7 @@ main_body: define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 { ; VERDE-LABEL: buffer_store_v4f16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 -; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v4f16: @@ -389,23 +377,7 @@ main_body: define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 { ; VERDE-LABEL: buffer_store_v8f16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; VERDE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VERDE-NEXT: v_cvt_f16_f32_e32 v9, v5 -; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; VERDE-NEXT: v_or_b32_e32 v5, v6, v5 -; VERDE-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_or_b32_e32 v4, v4, v6 -; VERDE-NEXT: v_or_b32_e32 v3, v2, v3 -; VERDE-NEXT: v_or_b32_e32 v2, v0, v1 -; VERDE-NEXT: buffer_store_dwordx4 v[2:5], v8, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v8f16: @@ -420,11 +392,7 @@ main_body: define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { ; VERDE-LABEL: buffer_store_v2bf16: ; VERDE: ; %bb.0: -; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v2bf16: @@ -438,15 +406,7 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { ; VERDE-LABEL: buffer_store_v4bf16: ; VERDE: ; %bb.0: -; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v4bf16: @@ -476,10 +436,7 @@ main_body: define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) { ; VERDE-LABEL: buffer_store_v2i16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v2i16: @@ -494,13 +451,7 @@ main_body: define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { ; VERDE-LABEL: buffer_store_v4i16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 -; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v4i16: @@ -521,19 +472,7 @@ main_body: define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 { ; VERDE-LABEL: buffer_store_v8i16: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v6, v6, v7 -; VERDE-NEXT: v_or_b32_e32 v5, v4, v5 -; VERDE-NEXT: v_or_b32_e32 v4, v2, v3 -; VERDE-NEXT: v_or_b32_e32 v3, v0, v1 -; VERDE-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; ; CHECK-LABEL: buffer_store_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll index 4bde9db509f13..1b300528a4fd5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -343,19 +343,10 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, } define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) { -; VERDE-LABEL: struct_buffer_store_v2f16: -; VERDE: ; %bb.0: -; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: struct_buffer_store_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: struct_buffer_store_v2f16: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: struct_buffer_store_v2f16: ; GFX11: ; %bb.0: @@ -372,23 +363,10 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal } define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) { -; VERDE-LABEL: struct_buffer_store_v4f16: -; VERDE: ; %bb.0: -; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 -; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 -; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: struct_buffer_store_v4f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: struct_buffer_store_v4f16: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: struct_buffer_store_v4f16: ; GFX11: ; %bb.0: @@ -431,18 +409,10 @@ main_body: } define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) { -; VERDE-LABEL: struct_buffer_store_vif16: -; VERDE: ; %bb.0: -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 -; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: struct_buffer_store_vif16: -; GFX8: ; %bb.0: -; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: struct_buffer_store_vif16: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: struct_buffer_store_vif16: ; GFX11: ; %bb.0: @@ -459,21 +429,10 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16 } define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) { -; VERDE-LABEL: struct_buffer_store_v4i16: -; VERDE: ; %bb.0: -; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 -; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 -; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen -; VERDE-NEXT: s_endpgm -; -; GFX8-LABEL: struct_buffer_store_v4i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -; GFX8-NEXT: s_endpgm +; GFX68-LABEL: struct_buffer_store_v4i16: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm ; ; GFX11-LABEL: struct_buffer_store_v4i16: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll index df943522404a2..6cc6472bc0ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -157,41 +157,19 @@ define amdgpu_ps void @struct_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, } define amdgpu_ps void @struct_ptr_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) { -; SI-LABEL: struct_ptr_buffer_store_v2f16: -; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen -; SI-NEXT: s_endpgm -; -; VI-LABEL: struct_ptr_buffer_store_v2f16: -; VI: ; %bb.0: -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; VI-NEXT: s_endpgm +; CHECK-LABEL: struct_ptr_buffer_store_v2f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } define amdgpu_ps void @struct_ptr_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) { -; SI-LABEL: struct_ptr_buffer_store_v4f16: -; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen -; SI-NEXT: s_endpgm -; -; VI-LABEL: struct_ptr_buffer_store_v4f16: -; VI: ; %bb.0: -; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -; VI-NEXT: s_endpgm +; CHECK-LABEL: struct_ptr_buffer_store_v4f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -210,38 +188,19 @@ main_body: } define amdgpu_ps void @struct_ptr_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) { -; SI-LABEL: struct_ptr_buffer_store_vif16: -; SI: ; %bb.0: -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen -; SI-NEXT: s_endpgm -; -; VI-LABEL: struct_ptr_buffer_store_vif16: -; VI: ; %bb.0: -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; VI-NEXT: s_endpgm +; CHECK-LABEL: struct_ptr_buffer_store_vif16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } define amdgpu_ps void @struct_ptr_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) { -; SI-LABEL: struct_ptr_buffer_store_v4i16: -; SI: ; %bb.0: -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen -; SI-NEXT: s_endpgm -; -; VI-LABEL: struct_ptr_buffer_store_v4i16: -; VI: ; %bb.0: -; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -; VI-NEXT: s_endpgm +; CHECK-LABEL: struct_ptr_buffer_store_v4i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3897a0e028334..33b644181af52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -6076,31 +6076,33 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v2f16: @@ -6178,36 +6180,34 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fabs_v2f16: @@ -6286,40 +6286,35 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp_fneg_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_v2f16: @@ -6399,40 +6394,35 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp_fneg_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_v2f16: @@ -6496,31 +6486,37 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp_v2f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v2f16_fast: @@ -6576,41 +6572,41 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; SI-SDAG-LABEL: v_exp_v3f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v3f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v3f16: @@ -6682,47 +6678,47 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-SDAG-LABEL: v_exp_v3f16_afn: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_v3f16_afn: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_v3f16_afn: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 574b1c0b4974c..de1f2e900e326 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -6162,31 +6162,33 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp10_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v2f16: @@ -6264,36 +6266,34 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp10_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fabs_v2f16: @@ -6372,40 +6372,35 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp10_fneg_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_fneg_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_fabs_v2f16: @@ -6485,40 +6480,35 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp10_fneg_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_fneg_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_v2f16: @@ -6611,55 +6601,61 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp10_v2f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a278000, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a278000, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3a278000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a278000, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3a278000, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a278000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v2f16_fast: @@ -6715,41 +6711,41 @@ define <3 x half> @v_exp10_v3f16(<3 x half> %in) { ; SI-SDAG-LABEL: v_exp10_v3f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_v3f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2 +; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v3f16: @@ -6861,89 +6857,95 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-SDAG-LABEL: v_exp10_v3f16_afn: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a278000, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3a278000, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40548000, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a278000, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_v3f16_afn: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a278000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a278000, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40548000, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a278000, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a278000, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3a278000, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3a278000, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40548000, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v4, v4 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v4 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_v3f16_afn: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index dd44a1a35067e..390fedb1d2ef3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -3143,31 +3143,19 @@ define half @v_exp2_f16_fast(half %in) { } define <2 x half> @v_exp2_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_exp2_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_v2f16: ; VI-SDAG: ; %bb.0: @@ -3218,32 +3206,30 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_exp2_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fabs_v2f16: @@ -3295,40 +3281,20 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) { } define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_fabs_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_fabs_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16: ; VI-SDAG: ; %bb.0: @@ -3380,40 +3346,20 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) { } define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_exp2_fneg_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_v2f16: ; VI-SDAG: ; %bb.0: @@ -3464,27 +3410,19 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) { } define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) { -; SI-SDAG-LABEL: v_exp2_v2f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_v2f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_v2f16_fast: ; VI-SDAG: ; %bb.0: @@ -3532,39 +3470,22 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) { } define <3 x half> @v_exp_v3f16(<3 x half> %in) { -; SI-SDAG-LABEL: v_exp_v3f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_v3f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v2, v2 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp_v3f16: ; VI-SDAG: ; %bb.0: @@ -3616,39 +3537,22 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { } define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) { -; SI-SDAG-LABEL: v_exp2_v3f16_afn: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_v3f16_afn: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_exp_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_v3f16_afn: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_exp_f32_e32 v2, v2 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_v3f16_afn: ; VI-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 37b49d48a10ee..c562eb168478f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -378,21 +378,24 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v0, v2 -; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v3 -; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 +; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v3 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v3|, s4 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v3 +; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32: @@ -486,21 +489,24 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v4, v0 -; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v1, v2 +; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, v3 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v5, v0 ; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v3 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v4, v1 -; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3 -; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32: @@ -596,17 +602,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_v2f16_v2i32_only_use_fract: @@ -670,17 +679,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v0 -; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v1 ; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v2 ; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: @@ -741,17 +753,16 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) { ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_exp: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v1 +; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v3, v1 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_exp: @@ -827,15 +838,16 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) { ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 -; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v3, v1 +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v0, v1 ; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v2 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX6-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v1, v3 +; GFX6-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v2 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 956145fb24c4a..e94a2813f2ecc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -1006,11 +1006,9 @@ define <2 x i1> @isnan_v2bf16(<2 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v2bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7CHECK-NEXT: v_bfe_u32 v1, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 @@ -1079,14 +1077,11 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v3bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v1 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7CHECK-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7CHECK-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1171,18 +1166,14 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v4bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7CHECK-NEXT: v_bfe_u32 v3, v1, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7CHECK-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v1 +; GFX7CHECK-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7CHECK-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 -; GFX7CHECK-NEXT: v_bfe_u32 v3, v3, 16, 15 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v2 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index dd19ba17bb292..a7b6e5877adf4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -1354,11 +1354,9 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX7SELDAG-LABEL: isnan_v2f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7SELDAG-NEXT: v_bfe_u32 v1, v0, 16, 15 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 @@ -1368,6 +1366,7 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX7GLISEL-LABEL: isnan_v2f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7GLISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 @@ -1489,14 +1488,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX7SELDAG-LABEL: isnan_v3f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v1 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_bfe_u32 v1, v0, 16, 15 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1507,18 +1503,19 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX7GLISEL-LABEL: isnan_v3f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7GLISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v3, 0x7c00 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v3 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v3 +; GFX7GLISEL-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v3 -; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v3 +; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v4, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1661,18 +1658,14 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX7SELDAG-LABEL: isnan_v4f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7SELDAG-NEXT: v_bfe_u32 v3, v1, 16, 15 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v1 +; GFX7SELDAG-NEXT: v_bfe_u32 v1, v0, 16, 15 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 -; GFX7SELDAG-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v2 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -1683,20 +1676,22 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX7GLISEL-LABEL: isnan_v4f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7GLISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v4, 0x7c00 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v4 +; GFX7GLISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v4 +; GFX7GLISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v4 -; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v2, v4 +; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v5, v4 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v4 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 5fae6de4a9682..749600b4a99f7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -435,12 +435,15 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i32: @@ -496,12 +499,15 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v2 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32: @@ -563,14 +569,17 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i16: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v3, 16, v1 +; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i16: @@ -610,14 +619,17 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GFX6-GISEL-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16: @@ -663,15 +675,18 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v3f16_v3i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v3 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v5, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v5 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v3f16_v3i32: @@ -735,15 +750,18 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v3f16_v3i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v5 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v3f16_v3i32: @@ -817,18 +835,21 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v3f16_v3i16: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v5, 16, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v4, v5 +; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v5 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v3f16_v3i16: @@ -872,18 +893,21 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v3f16_v3i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-GISEL-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; GFX6-GISEL-NEXT: v_bfe_i32 v3, v4, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX6-GISEL-NEXT: v_bfe_i32 v3, v5, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v3 +; GFX6-GISEL-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX6-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v4, v4, v5 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v3f16_v3i16: @@ -937,18 +961,24 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v4f16_v4i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v5 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v6 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v3, v7 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v6, v3 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v7, v5 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v4f16_v4i32: @@ -1026,18 +1056,24 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v6, v2 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v6 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v7 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v3 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v6, v5 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32: @@ -1123,22 +1159,28 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v4f16_v4i16: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v5 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v6 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v3, v7 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v6, v7, v6 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_v4f16_v4i16: @@ -1192,22 +1234,28 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-GISEL-NEXT: v_bfe_i32 v6, v2, 0, 16 +; GFX6-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v4, v2 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-GISEL-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; GFX6-GISEL-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-GISEL-NEXT: v_bfe_i32 v4, v6, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v4 -; GFX6-GISEL-NEXT: v_bfe_i32 v4, v7, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v4 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GFX6-GISEL-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v6 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v4, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5 ; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 6353640bed146..59c1c2facb5c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6692,31 +6692,33 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v2f16: @@ -6814,36 +6816,34 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fabs_v2f16: @@ -6958,40 +6958,35 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log_fneg_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_fneg_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fneg_fabs_v2f16: @@ -7107,40 +7102,35 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log_fneg_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_fneg_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_fneg_v2f16: @@ -7255,27 +7245,33 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) { ; SI-SDAG-LABEL: v_log_v2f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v2f16_fast: @@ -7370,45 +7366,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) { } define <3 x half> @v_log_v3f16(<3 x half> %in) { -; SI-SDAG-LABEL: v_log_v3f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_v3f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_v3f16: ; VI: ; %bb.0: @@ -7507,39 +7483,25 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) { } define <3 x half> @v_log_v3f16_fast(<3 x half> %in) { -; SI-SDAG-LABEL: v_log_v3f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_v3f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_v3f16_fast: ; VI: ; %bb.0: @@ -7641,51 +7603,55 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v4f16: @@ -7839,43 +7805,55 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 58665c7b24aea..2dc85d3c161a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6692,31 +6692,33 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log10_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v2f16: @@ -6814,36 +6816,34 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log10_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fabs_v2f16: @@ -6958,40 +6958,35 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log10_fneg_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_fneg_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fneg_fabs_v2f16: @@ -7107,40 +7102,35 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log10_fneg_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_fneg_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_fneg_v2f16: @@ -7255,27 +7245,33 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { ; SI-SDAG-LABEL: v_log10_v2f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v2f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v2f16_fast: @@ -7370,45 +7366,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { } define <3 x half> @v_log10_v3f16(<3 x half> %in) { -; SI-SDAG-LABEL: v_log10_v3f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_v3f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_v3f16: ; VI: ; %bb.0: @@ -7507,39 +7483,25 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) { } define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) { -; SI-SDAG-LABEL: v_log10_v3f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_v3f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_v3f16_fast: ; VI: ; %bb.0: @@ -7641,51 +7603,55 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v4f16: @@ -7839,43 +7805,55 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log10_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index cf2c8fe8fc574..047cc9addbcfc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -4079,31 +4079,19 @@ define half @v_log2_f16_fast(half %in) { } define <2 x half> @v_log2_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_log2_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v2f16: ; VI-SDAG: ; %bb.0: @@ -4190,32 +4178,30 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) { ; SI-SDAG-LABEL: v_log2_fabs_v2f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fabs_v2f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fabs_v2f16: @@ -4306,40 +4292,20 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) { } define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_log2_fneg_fabs_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_fabs_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_fabs_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_fabs_v2f16: ; VI-SDAG: ; %bb.0: @@ -4430,40 +4396,20 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) { } define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) { -; SI-SDAG-LABEL: v_log2_fneg_v2f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_v2f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_v2f16: ; VI-SDAG: ; %bb.0: @@ -4553,27 +4499,19 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) { } define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) { -; SI-SDAG-LABEL: v_log2_v2f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_v2f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v2f16_fast: ; VI-SDAG: ; %bb.0: @@ -4657,39 +4595,22 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) { } define <3 x half> @v_log2_v3f16(<3 x half> %in) { -; SI-SDAG-LABEL: v_log2_v3f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_v3f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v3f16: ; VI-SDAG: ; %bb.0: @@ -4781,33 +4702,22 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) { } define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) { -; SI-SDAG-LABEL: v_log2_v3f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_v3f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_log_f32_e32 v2, v2 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v3f16_fast: ; VI-SDAG: ; %bb.0: @@ -4902,43 +4812,47 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log2_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v4f16: @@ -5050,35 +4964,47 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log2_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16_fast: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 7fd70de81af6f..6d371d4b76e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -632,21 +632,23 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16: @@ -739,16 +741,18 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan: @@ -794,21 +798,23 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nsz: @@ -901,16 +907,18 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX7-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan_nsz: @@ -956,14 +964,12 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s19 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s18 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_lshr_b32 s4, s17, 16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX7-NEXT: s_lshr_b32 s4, s16, 16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, s17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, s16 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_max_f32_e32 v4, v1, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0 @@ -1109,28 +1115,29 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16: @@ -1242,21 +1249,22 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan: @@ -1307,28 +1315,29 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nsz: @@ -1440,21 +1449,22 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX7-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan_nsz: @@ -1505,35 +1515,39 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16: @@ -1666,26 +1680,30 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan: @@ -1738,35 +1756,39 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nsz: @@ -1899,26 +1921,30 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX7-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan_nsz: @@ -1971,63 +1997,71 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX7-LABEL: v_maximum_v8f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_max_f32_e32 v16, v0, v8 +; GFX7-NEXT: v_max_f32_e32 v16, v15, v14 ; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v1, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v2, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v3, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v4, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v5, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v6, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v8, v7, v15 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v15, v13, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v15, vcc +; GFX7-NEXT: v_max_f32_e32 v13, v11, v10 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc +; GFX7-NEXT: v_max_f32_e32 v11, v9, v8 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v11, vcc +; GFX7-NEXT: v_max_f32_e32 v9, v3, v7 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v9, vcc +; GFX7-NEXT: v_max_f32_e32 v7, v2, v6 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v7, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v1, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v6, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v12 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f16: @@ -2241,121 +2275,135 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7-LABEL: v_maximum_v16f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v16 +; GFX7-NEXT: v_max_f32_e32 v16, v18, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v18, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; GFX7-NEXT: v_max_f32_e32 v17, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v20, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-NEXT: v_max_f32_e32 v23, v19, v18 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v19, v18 +; GFX7-NEXT: v_max_f32_e32 v18, v22, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v22, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_max_f32_e32 v22, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v20, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v6, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_max_f32_e32 v24, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v20, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v25, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v20, v19 +; GFX7-NEXT: v_max_f32_e32 v19, v6, v14 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v13 +; GFX7-NEXT: v_mov_b32_e32 v21, 0x7fc00000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v13, v5, v6 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v5, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v3, v11 +; GFX7-NEXT: v_max_f32_e32 v14, v6, v5 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v1, v16 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v18 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v2, v16 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v3, v16 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v17 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v4, v16 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v21 -; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v18, v13 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v5, v16 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GFX7-NEXT: v_max_f32_e32 v13, v18, v13 -; GFX7-NEXT: v_max_f32_e32 v18, v17, v12 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v17, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_mov_b32_e32 v19, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v16 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v23 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v19, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v16 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v24 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[14:15] -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v19, v7, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v16 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v25 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[18:19] -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v16 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[20:21] -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v16 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[22:23] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v19, v18, s[28:29] -; GFX7-NEXT: v_max_f32_e32 v18, v14, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v19, v18, vcc -; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v12, v3, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v21, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v11, v4, v7 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v6, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v7 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v21, v3, s[24:25] +; GFX7-NEXT: v_max_f32_e32 v3, v2, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v21, v11, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX7-NEXT: v_max_f32_e32 v5, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v21, v3, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX7-NEXT: v_max_f32_e32 v7, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v20, v21, v24, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v21, v5, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v16, v21, v17, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v17, v21, v18, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v18, v21, v22, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v21, v7, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v20 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v21, v23, s[6:7] +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v17 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[22:23] +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v21, v14, s[20:21] +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v13, v21, v13, s[18:19] +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v16 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v22, v21, v25, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[16:17] +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v15 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 22f09579a8f5d..24d6f4f84e816 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -577,16 +577,19 @@ entry: define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: minnum_v2f16_no_ieee: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: minnum_v2f16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 96c4c3e74b384..4233367b3d5bb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6027,85 +6027,97 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fadd_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6250,85 +6262,98 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6467,17 +6492,20 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fadd_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6508,17 +6536,20 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fadd_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6681,17 +6712,20 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6722,18 +6756,21 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7045,78 +7082,86 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -7398,78 +7443,87 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7743,14 +7797,16 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v4, v0 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7780,14 +7836,16 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8083,14 +8141,16 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8121,14 +8181,16 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index e0745fda6c003..4dd7f0e3c450a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -5697,85 +5697,97 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmax_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5968,85 +5980,98 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6232,17 +6257,20 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmax_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6273,17 +6301,20 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmax_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6493,17 +6524,20 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6534,18 +6568,21 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6980,78 +7017,86 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -7456,78 +7501,87 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7917,15 +7971,17 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7933,8 +7989,8 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -7954,15 +8010,17 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7970,8 +8028,8 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -8373,15 +8431,17 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8389,8 +8449,8 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -8411,15 +8471,17 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8427,8 +8489,8 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index a03d02691a8b4..57fe5f708e216 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -5697,85 +5697,97 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmin_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5968,85 +5980,98 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6232,17 +6257,20 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmin_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6273,17 +6301,20 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmin_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6493,17 +6524,20 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6534,18 +6568,21 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -6980,78 +7017,86 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -7456,78 +7501,87 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -7917,15 +7971,17 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7933,8 +7989,8 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -7954,15 +8010,17 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -7970,8 +8028,8 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -8373,15 +8431,17 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8389,8 +8449,8 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 @@ -8411,15 +8471,17 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8427,8 +8489,8 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 27dc54969380b..3a971a3b5a8d2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6520,85 +6520,97 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6774,85 +6786,98 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 offset:65532 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffc, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v3, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -7019,17 +7044,20 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7060,17 +7088,20 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fsub_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7261,17 +7292,20 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7302,18 +7336,21 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7748,78 +7785,86 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -8224,78 +8269,87 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x bfloat> %val seq_cst @@ -8685,14 +8739,16 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v4, v0 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8722,14 +8778,16 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9141,14 +9199,16 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9179,14 +9239,16 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v4, v0 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 2b6c487e27ed2..3d48ff437e8ff 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -40,7 +40,7 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: @@ -50,7 +50,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -102,8 +103,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: @@ -122,8 +123,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -173,8 +175,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: @@ -192,8 +195,10 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GISEL-CI-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -416,7 +421,7 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: @@ -435,7 +440,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -476,7 +482,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: @@ -492,7 +500,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -558,6 +567,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp ; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: @@ -587,6 +598,7 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 8b31944acc15a..c452f9701ca00 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -459,24 +459,21 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-CI-LABEL: v_mad_mix_v2f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32: @@ -507,16 +504,21 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; GISEL-CI-LABEL: v_mad_mix_v2f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -589,33 +591,26 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-CI-LABEL: v_mad_mix_v3f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v8, v2, v5 -; SDAG-CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32: @@ -670,21 +665,26 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; GISEL-CI-LABEL: v_mad_mix_v3f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v6, v7 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> @@ -770,42 +770,36 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-CI-LABEL: v_mad_mix_v4f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; SDAG-CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; SDAG-CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32: @@ -872,26 +866,36 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; GISEL-CI-LABEL: v_mad_mix_v4f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v11, v7, v9 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GISEL-CI-NEXT: v_or_b32_e32 v1, v3, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> @@ -957,24 +961,25 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: @@ -1005,28 +1010,33 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v3, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1120,33 +1130,32 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; SDAG-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1205,39 +1214,44 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v6, v7 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> @@ -1335,42 +1349,44 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; SDAG-CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; SDAG-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: @@ -1415,50 +1431,60 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_mac_f32_e32 v11, v7, v9 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GISEL-CI-NEXT: v_or_b32_e32 v1, v2, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> @@ -1527,24 +1553,23 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1602,16 +1627,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 @@ -1622,7 +1650,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1690,24 +1717,23 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: @@ -1768,16 +1794,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1790,7 +1819,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1868,24 +1896,21 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mad_f32 v3, v5, v4, v3 clamp +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt: @@ -1939,16 +1964,21 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_mad_f32 v1, v3, v4, v5 clamp ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -2038,33 +2068,26 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp -; SDAG-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt: @@ -2129,21 +2152,26 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp -; GISEL-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-CI-NEXT: v_mad_f32 v2, v6, v7, v8 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> @@ -2249,42 +2277,36 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp -; SDAG-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp -; SDAG-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp +; SDAG-CI-NEXT: v_mad_f32 v9, v11, v10, v9 clamp +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v9 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG-CI-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt: @@ -2365,26 +2387,36 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp -; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp -; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-CI-NEXT: v_mad_f32 v2, v6, v8, v10 clamp +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-CI-NEXT: v_mad_f32 v3, v7, v9, v11 clamp ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GISEL-CI-NEXT: v_or_b32_e32 v1, v1, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index 95df131e21358..a252a63ca83e0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -178,16 +178,26 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % ; SDAG-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v1, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 @@ -254,21 +264,19 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; SDAG-CI-LABEL: v_mad_mix_v2f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SDAG-CI-NEXT: v_mac_f32_e32 v3, v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v4, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32: @@ -300,14 +308,17 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; GISEL-CI-LABEL: v_mad_mix_v2f32: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v6, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v1, v7, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v4, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v3, v6 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -371,28 +382,32 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5 -; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v4, v1, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v3 +; SDAG-CI-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GISEL-CI-NEXT: v_mad_f32 v0, v4, v0, v1 -; GISEL-CI-NEXT: v_mac_f32_e32 v1, v6, v2 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_mad_f32 v0, v3, v5, v1 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v4, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.shuf = shufflevector <2 x half> %src0, <2 x half> poison, <2 x i32> %src1.shuf = shufflevector <2 x half> %src1, <2 x half> poison, <2 x i32> @@ -1246,16 +1261,14 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) ; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imm1: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; SDAG-CI-NEXT: v_mad_f32 v1, v3, v2, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: @@ -1311,12 +1324,14 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) ; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imm1: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 -; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; GISEL-CI-NEXT: v_mad_f32 v1, v2, v3, 1.0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1380,17 +1395,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> ; SDAG-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 -; SDAG-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 -; SDAG-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e230000 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: @@ -1448,13 +1461,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> ; GISEL-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 -; GISEL-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 -; GISEL-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e230000 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v2, v3 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1517,17 +1532,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s ; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 -; SDAG-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 -; SDAG-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e22f983 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: @@ -1583,13 +1596,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s ; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 -; GISEL-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 -; GISEL-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; GISEL-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e22f983 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v2, v3 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -1638,15 +1653,24 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h ; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index cd387b5a429e9..65b2f016a6ba0 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -372,18 +372,18 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v2bf16: @@ -732,18 +732,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX7-LABEL: v_maximumnum_v2bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v2bf16_nnan: @@ -978,24 +974,24 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v3bf16: @@ -1473,24 +1469,18 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX7-LABEL: v_maximumnum_v3bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v3bf16_nnan: @@ -1804,30 +1794,30 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v4bf16: @@ -2465,30 +2455,22 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX7-LABEL: v_maximumnum_v4bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v4bf16_nnan: @@ -2904,42 +2886,42 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v6bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v11 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v9 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v7 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v7, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v6bf16: @@ -3866,54 +3848,54 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v8, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v8bf16: @@ -5114,104 +5096,102 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v10, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v11, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v12, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v14, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v15, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v16bf16: @@ -7581,264 +7561,200 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX7-LABEL: v_maximumnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v27, v27, v33 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX7-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; GFX7-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX7-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX7-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX7-NEXT: v_alignbit_b32 v12, v19, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v17, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v32bf16: @@ -12615,18 +12531,14 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX7-LABEL: v_maximumnum_v2bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v2bf16_no_ieee: @@ -12975,24 +12887,18 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX7-LABEL: v_maximumnum_v3bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v3bf16_no_ieee: @@ -13470,30 +13376,22 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX7-LABEL: v_maximumnum_v4bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_v4bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index c90b2c9170414..7d52b2e1d70c6 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -3372,29 +3372,35 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v2f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v2f16: @@ -3522,29 +3528,35 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v2f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_nnan: @@ -3598,38 +3610,43 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v3f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v3f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v3f16: @@ -3781,38 +3798,43 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v3f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v3f16_nnan: @@ -3879,47 +3901,59 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v4f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v4f16: @@ -4091,47 +4125,59 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v4f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v4f16_nnan: @@ -4195,65 +4241,83 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v6f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v7 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v10 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v6f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v6, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v8 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v9 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v6 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v7 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v6f16: @@ -4469,83 +4533,107 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v13 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v15 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v8f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v8, v12 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v12 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v13 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v15 -; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v8 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v9 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v8f16: @@ -4805,159 +4893,203 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v17 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v23 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v16 -; GFX7-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v17, v18 +; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v16, v19 +; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v20, v21 +; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 +; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v18 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v29 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v30 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v20 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v18 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v19 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v16f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v20 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v21 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v22 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v16 -; GFX7-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v23 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v8, v18 -; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v9, v19 -; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v10, v20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v17, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_max_f32_e32 v16, v16, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v27 +; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v28 +; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v29 +; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v30 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v12, v18 -; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v13, v19 -; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v14, v20 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v15 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v19, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v14 +; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v15, v17 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_max_f32_e32 v15, v15, v16 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v16f16: @@ -5389,431 +5521,408 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-SDAG-NEXT: v_max_f32_e32 v32, v33, v34 +; GFX7-SDAG-NEXT: v_max_f32_e32 v33, v35, v36 +; GFX7-SDAG-NEXT: v_max_f32_e32 v35, v37, v38 +; GFX7-SDAG-NEXT: v_max_f32_e32 v37, v50, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v36, v39, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GFX7-SDAG-NEXT: v_max_f32_e32 v38, v52, v53 +; GFX7-SDAG-NEXT: v_max_f32_e32 v39, v54, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_max_f32_e32 v52, v40, v41 +; GFX7-SDAG-NEXT: v_max_f32_e32 v50, v50, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_max_f32_e32 v49, v43, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GFX7-SDAG-NEXT: v_max_f32_e32 v51, v51, v53 +; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v55 +; GFX7-SDAG-NEXT: v_max_f32_e32 v54, v40, v41 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v34, v42, v34 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v15, v48 +; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v55, v42, v43 +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 +; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GFX7-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 +; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v16, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v17, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v18, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v19, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v20, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v21, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v22, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v23, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v24, v24, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v25, v25, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v26, v26, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v27, v27, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v28, v28, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v29, v29, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v30, v30, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v31, v32 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v32f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v16 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-GISEL-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v17 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v16, v0, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-GISEL-NEXT: v_max_f32_e32 v17, v17, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v19 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-GISEL-NEXT: v_max_f32_e32 v18, v2, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v20 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v19, v19, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v21 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX7-GISEL-NEXT: v_max_f32_e32 v20, v4, v20 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v22 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v21, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-GISEL-NEXT: v_max_f32_e32 v21, v21, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v23 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v22, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-GISEL-NEXT: v_max_f32_e32 v22, v22, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v24 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v23, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-GISEL-NEXT: v_max_f32_e32 v23, v23, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v25 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v24, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-GISEL-NEXT: v_max_f32_e32 v24, v24, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v26 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v25, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-GISEL-NEXT: v_max_f32_e32 v25, v25, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v27 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v26, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-GISEL-NEXT: v_max_f32_e32 v26, v26, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v28 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v27, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-GISEL-NEXT: v_max_f32_e32 v27, v27, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v29 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v28, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-GISEL-NEXT: v_max_f32_e32 v28, v28, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v30 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v29, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX7-GISEL-NEXT: v_max_f32_e32 v29, v29, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v15 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX7-GISEL-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v17 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v8, v8, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v9, v9, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v10, v10, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v11, v11, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v12, v12, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v13, v13, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v14, v14, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v15, v15, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v33, v30 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-GISEL-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX7-GISEL-NEXT: v_max_f32_e32 v15, v15, v30 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v30, v31 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v31, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v18 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v16, v16, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v17, v17, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v18, v18, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v19, v19, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v20, v20, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v21, v21, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v22, v22, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v23, v23, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v24, v24, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v25, v25, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v26, v26, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v27, v27, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v28, v28, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v29, v29, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v30, v30, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v30, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v20 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v19 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v21 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v22 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v29 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v23 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v24 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v8, v16, v8 +; GFX7-GISEL-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v25 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v26 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v31, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX7-GISEL-NEXT: v_or_b32_e32 v10, v16, v10 +; GFX7-GISEL-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v27 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GFX7-GISEL-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v12, v16, v12 +; GFX7-GISEL-NEXT: v_or_b32_e32 v13, v17, v13 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v32f16: @@ -8497,29 +8606,35 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: @@ -8589,29 +8704,35 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v2f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_nnan_no_ieee: @@ -8665,38 +8786,43 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee: @@ -8763,47 +8889,59 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v4f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v4f16_nnan_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index dc47782c15281..a3c9977fee488 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -378,18 +378,18 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v2bf16: @@ -741,18 +741,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX7-LABEL: v_minimumnum_v2bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v2bf16_nnan: @@ -990,24 +986,24 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v3bf16: @@ -1488,24 +1484,18 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX7-LABEL: v_minimumnum_v3bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v3bf16_nnan: @@ -1821,30 +1811,30 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v4bf16: @@ -2485,30 +2475,22 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX7-LABEL: v_minimumnum_v4bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v4bf16_nnan: @@ -2927,42 +2909,42 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v6bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v11 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v9 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v7 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v7, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v6bf16: @@ -3892,54 +3874,54 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v8, v3, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v8bf16: @@ -5142,104 +5124,102 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v10, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v11, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v12, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v14, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v15, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v16bf16: @@ -7611,264 +7591,200 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX7-LABEL: v_minimumnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v27, v27, v33 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX7-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; GFX7-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; GFX7-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX7-NEXT: v_alignbit_b32 v5, v16, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; GFX7-NEXT: v_alignbit_b32 v7, v16, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; GFX7-NEXT: v_alignbit_b32 v8, v16, v8, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; GFX7-NEXT: v_alignbit_b32 v9, v16, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GFX7-NEXT: v_alignbit_b32 v11, v16, v11, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; GFX7-NEXT: v_alignbit_b32 v12, v19, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v17, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v32bf16: @@ -12650,18 +12566,14 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX7-LABEL: v_minimumnum_v2bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v2bf16_no_ieee: @@ -13013,24 +12925,18 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX7-LABEL: v_minimumnum_v3bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v3bf16_no_ieee: @@ -13511,30 +13417,22 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX7-LABEL: v_minimumnum_v4bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_v4bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 64e8b7b50de08..329118e3dca01 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -3197,29 +3197,35 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v2f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v2f16: @@ -3347,29 +3353,35 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v2f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_nnan: @@ -3423,38 +3435,43 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v3f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v3f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v3f16: @@ -3606,38 +3623,43 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v3f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v3f16_nnan: @@ -3704,47 +3726,59 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v4f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v4f16: @@ -3916,47 +3950,59 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v4f16_nnan: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v4f16_nnan: @@ -4020,65 +4066,83 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v6f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v7 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v10 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v6f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v6, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v8 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v9 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v6 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v7 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v6, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v6f16: @@ -4294,83 +4358,107 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v13 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v15 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v8f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v8, v12 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v12 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v13 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v15 -; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v8 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v9 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v8, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v8f16: @@ -4630,159 +4718,203 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v17 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v23 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v16 -; GFX7-SDAG-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v17, v18 +; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v16, v19 +; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v20, v21 +; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v22, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 +; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v18 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v29 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v30 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v20 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v18 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v19 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v16f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v20 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v21 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v22 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v18 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v16 -; GFX7-GISEL-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v19 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v23 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v8, v18 -; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v9, v19 -; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v10, v20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v17, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_min_f32_e32 v16, v16, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v27 +; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v28 +; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v29 +; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v30 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v11, v17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v12, v18 -; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v13, v19 -; GFX7-GISEL-NEXT: v_min_f32_e32 v14, v14, v20 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v15 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v19, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v14 +; GFX7-GISEL-NEXT: v_min_f32_e32 v14, v15, v17 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-GISEL-NEXT: v_min_f32_e32 v15, v15, v16 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v16f16: @@ -5214,431 +5346,408 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-SDAG-NEXT: v_min_f32_e32 v32, v33, v34 +; GFX7-SDAG-NEXT: v_min_f32_e32 v33, v35, v36 +; GFX7-SDAG-NEXT: v_min_f32_e32 v35, v37, v38 +; GFX7-SDAG-NEXT: v_min_f32_e32 v37, v50, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v36, v39, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GFX7-SDAG-NEXT: v_min_f32_e32 v38, v52, v53 +; GFX7-SDAG-NEXT: v_min_f32_e32 v39, v54, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_min_f32_e32 v52, v40, v41 +; GFX7-SDAG-NEXT: v_min_f32_e32 v50, v50, v51 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_min_f32_e32 v49, v43, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GFX7-SDAG-NEXT: v_min_f32_e32 v51, v51, v53 +; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v55 +; GFX7-SDAG-NEXT: v_min_f32_e32 v54, v40, v41 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v34, v42, v34 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v15, v48 +; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v55, v42, v43 +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 +; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GFX7-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 +; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v16, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v17, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v18, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v19, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v20, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v21, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v22, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v23, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v24, v24, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v25, v25, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v26, v26, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v27, v27, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v28, v28, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v29, v29, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v30, v30, v31 -; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v31, v32 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v32f16: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v16 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GFX7-GISEL-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v17 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v16, v0, v16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v18 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v17, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-GISEL-NEXT: v_min_f32_e32 v17, v17, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v19 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-GISEL-NEXT: v_min_f32_e32 v18, v2, v18 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v20 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v19, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v19, v19, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v21 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX7-GISEL-NEXT: v_min_f32_e32 v20, v4, v20 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v22 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v21, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-GISEL-NEXT: v_min_f32_e32 v21, v21, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v23 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v22, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-GISEL-NEXT: v_min_f32_e32 v22, v22, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v24 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v23, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-GISEL-NEXT: v_min_f32_e32 v23, v23, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v25 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v24, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-GISEL-NEXT: v_min_f32_e32 v24, v24, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v26 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v25, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-GISEL-NEXT: v_min_f32_e32 v25, v25, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v27 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v26, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-GISEL-NEXT: v_min_f32_e32 v26, v26, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v28 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v27, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-GISEL-NEXT: v_min_f32_e32 v27, v27, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v29 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v28, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-GISEL-NEXT: v_min_f32_e32 v28, v28, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v30 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v29, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX7-GISEL-NEXT: v_min_f32_e32 v29, v29, v32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v32, v15 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-GISEL-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX7-GISEL-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v17 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v8, v8, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v9, v9, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v10, v10, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v11, v11, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v12, v12, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v13, v13, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v14, v14, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v15, v15, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v33, v30 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-GISEL-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX7-GISEL-NEXT: v_min_f32_e32 v15, v15, v30 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v30, v31 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v31, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v18 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v16, v16, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v17, v17, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v18, v18, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v19, v19, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v20, v20, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v21, v21, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v22, v22, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v23, v23, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v24, v24, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v25, v25, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v26, v26, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v27, v27, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v28, v28, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v29, v29, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v30, v30, v31 -; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GFX7-GISEL-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v30, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v20 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v19 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v21 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v22 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v18, v29 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v23 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v24 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v8, v16, v8 +; GFX7-GISEL-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v25 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v26 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v31, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX7-GISEL-NEXT: v_or_b32_e32 v10, v16, v10 +; GFX7-GISEL-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v16, v27 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GFX7-GISEL-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX7-GISEL-NEXT: v_or_b32_e32 v12, v16, v12 +; GFX7-GISEL-NEXT: v_or_b32_e32 v13, v17, v13 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v32f16: @@ -8322,29 +8431,35 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: @@ -8414,29 +8529,35 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v2f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_nnan_no_ieee: @@ -8490,38 +8611,43 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee: @@ -8588,47 +8714,59 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v4f16_nnan_no_ieee: ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v5, v6 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v4f16_nnan_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 40676cef1bc5e..888d95e8c7329 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1959,14 +1959,18 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_add_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v2, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_add_1_v2i16: @@ -2011,14 +2015,18 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_add_1_v2i16_commute: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v2 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_add_1_v2i16_commute: @@ -2063,16 +2071,15 @@ define <2 x i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_add_x_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-NEXT: v_mad_u32_u24 v1, v5, v3, v1 -; GFX67-NEXT: v_mad_u32_u24 v0, v4, v2, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mad_u32_u24 v2, v3, v2, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v4, v1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_add_x_v2i16: @@ -2113,14 +2120,18 @@ define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_sub_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, -1, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v2, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_sub_1_v2i16: @@ -2165,14 +2176,18 @@ define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_sub_1_v2i16_commute: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, -1, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v2 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_sub_1_v2i16_commute: @@ -2217,18 +2232,17 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_sub_x_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-NEXT: v_mul_u32_u24_e32 v2, v4, v2 -; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-NEXT: v_mul_u32_u24_e32 v3, v4, v3 -; GFX67-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; GFX67-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v4, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; GFX67-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX67-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_sub_x_v2i16: @@ -2272,14 +2286,18 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_add_2_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, 2, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 2, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, 2, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v2, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_add_2_v2i16: @@ -2324,14 +2342,18 @@ define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-LABEL: v_mul_sub_2_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_add_i32_e32 v2, vcc, -2, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, -2, v1 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, -2, v3 -; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, v2, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_sub_2_v2i16: @@ -3281,14 +3303,14 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) { ; GFX67-LABEL: v_mul_9_add_52_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mad_u32_u24 v1, v1, 9, 52 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, 9, v1 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, 9, 52 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 0x340000, v0 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_9_add_52_v2i16: @@ -3562,14 +3584,14 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) { ; GFX67-LABEL: v_mul_5_add_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-NEXT: v_mad_u32_u24 v1, v1, 5, 1 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, 5, v1 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, 5, 1 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 0x10000, v0 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_5_add_1_v2i16: @@ -3609,16 +3631,16 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) { ; GFX67-LABEL: v_mul_284_add_82_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: s_movk_i32 s4, 0x11c ; GFX67-NEXT: v_mov_b32_e32 v2, 0x52 -; GFX67-NEXT: v_mad_u32_u24 v1, v1, s4, v2 +; GFX67-NEXT: v_mul_u32_u24_e32 v1, 0x11c, v1 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, s4, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xfffe, v0 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX67-NEXT: v_and_b32_e32 v1, 0xfffe, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 0x520000, v0 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_284_add_82_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index 88a8f3affc83a..38d0a377a3ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -263,8 +263,12 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 { ; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_f16_x2_arcp: @@ -527,13 +531,13 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -545,9 +549,14 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half ; GFX6-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_f16_x3_arcp: @@ -782,19 +791,16 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x ; GFX6-LABEL: v_repeat_divisor_v2f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v7, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; GFX6-NEXT: v_fma_f32 v7, v8, v7, v7 ; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v4, 1.0 @@ -802,24 +808,32 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x ; GFX6-NEXT: v_fma_f32 v10, -v6, v9, v8 ; GFX6-NEXT: v_fma_f32 v9, v10, v7, v9 ; GFX6-NEXT: v_fma_f32 v6, -v6, v9, v8 -; GFX6-NEXT: v_div_scale_f32 v8, s[4:5], v5, v5, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v10, v8 ; GFX6-NEXT: v_div_fmas_f32 v6, v6, v7, v9 +; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v2, v2, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v8, v7 ; GFX6-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0 -; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0 -; GFX6-NEXT: v_fma_f32 v6, -v8, v10, 1.0 -; GFX6-NEXT: v_fma_f32 v6, v6, v10, v10 -; GFX6-NEXT: v_mul_f32_e32 v9, v7, v6 -; GFX6-NEXT: v_fma_f32 v10, -v8, v9, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX6-NEXT: v_fma_f32 v6, -v7, v8, 1.0 +; GFX6-NEXT: v_fma_f32 v6, v6, v8, v8 +; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v9, v8, v6 +; GFX6-NEXT: v_fma_f32 v10, -v7, v9, v8 ; GFX6-NEXT: v_fma_f32 v9, v10, v6, v9 -; GFX6-NEXT: v_fma_f32 v7, -v8, v9, v7 +; GFX6-NEXT: v_fma_f32 v7, -v7, v9, v8 ; GFX6-NEXT: v_div_fmas_f32 v6, v7, v6, v9 -; GFX6-NEXT: v_div_fixup_f32 v5, v6, v5, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_div_fixup_f32 v2, v6, v2, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_v2f16_x2: @@ -864,63 +878,69 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x ; GFX6-LABEL: v_repeat_divisor_v3f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_div_scale_f32 v9, s[4:5], v6, v6, 1.0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_div_scale_f32 v9, s[4:5], v4, v4, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v10, v9 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX6-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX6-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX6-NEXT: v_div_scale_f32 v11, vcc, 1.0, v6, 1.0 +; GFX6-NEXT: v_div_scale_f32 v11, vcc, 1.0, v4, 1.0 ; GFX6-NEXT: v_mul_f32_e32 v12, v11, v10 ; GFX6-NEXT: v_fma_f32 v13, -v9, v12, v11 ; GFX6-NEXT: v_fma_f32 v12, v13, v10, v12 ; GFX6-NEXT: v_fma_f32 v9, -v9, v12, v11 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v7, v7, 1.0 +; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v13, v11 ; GFX6-NEXT: v_div_fmas_f32 v9, v9, v10, v12 -; GFX6-NEXT: v_div_fixup_f32 v6, v9, v6, 1.0 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v7, 1.0 +; GFX6-NEXT: v_div_fixup_f32 v4, v9, v4, 1.0 +; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 ; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 ; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 ; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 ; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 ; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 ; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 +; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v5, v5, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v13, v11 ; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_div_fixup_f32 v7, v9, v7, 1.0 +; GFX6-NEXT: v_div_fixup_f32 v8, v9, v8, 1.0 +; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v5, 1.0 ; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 ; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 ; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 ; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 ; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 ; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_div_fixup_f32 v8, v9, v8, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v7 -; GFX6-NEXT: v_mul_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX6-NEXT: v_mul_f32_e32 v4, v4, v7 -; GFX6-NEXT: v_mul_f32_e32 v5, v5, v8 +; GFX6-NEXT: v_div_fixup_f32 v5, v9, v5, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_mul_f32_e32 v7, v7, v8 +; GFX6-NEXT: v_mul_f32_e32 v4, v6, v8 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_v3f16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index a259156c09bd7..0b7e4e90dc317 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -430,23 +430,29 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { ; GFX6-LABEL: v_roundeven_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rndne_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_rndne_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16: @@ -492,23 +498,29 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { ; SDAG_GFX6-LABEL: v_roundeven_v2f16: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_v2f16: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX8-LABEL: v_roundeven_v2f16: @@ -558,33 +570,31 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; GFX6-LABEL: v_roundeven_v2f16_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_rndne_f32_e32 v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_rndne_f32_e32 v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v2f16_fneg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_rndne_f32_e32 v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_rndne_f32_e32 v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v2f16_fneg: @@ -635,31 +645,31 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { ; SDAG_GFX6-LABEL: v_roundeven_v2f16_fneg: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_v2f16_fneg: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX8-LABEL: v_roundeven_v2f16_fneg: @@ -710,35 +720,47 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; GFX6-LABEL: v_roundeven_v4f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_rndne_f32_e32 v0, v0 -; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_rndne_f32_e32 v2, v2 -; GFX6-NEXT: v_rndne_f32_e32 v3, v3 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_v4f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_rndne_f32_e32 v0, v0 -; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_rndne_f32_e32 v2, v2 -; GFX7-NEXT: v_rndne_f32_e32 v3, v3 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_v4f16: @@ -799,35 +821,47 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; SDAG_GFX6-LABEL: v_roundeven_v4f16: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v3, v3 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_v4f16: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v3, v3 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX8-LABEL: v_roundeven_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 4995ce6e57d00..9debb88dd0d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -158,20 +158,19 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: s_movk_i32 s4, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v3 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: @@ -216,25 +215,24 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: s_movk_i32 s4, 0x8000 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 -; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_med3_i32 v4, v4, s4, v5 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v3i16: @@ -288,30 +286,30 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v0 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x8000 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 -; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_med3_i32 v6, v6, s4, v7 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v7 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v7 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 -; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 5518c7a14cc69..b0e920478e3a5 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -9,24 +9,25 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_fabs_v2f16: @@ -100,30 +101,34 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| +; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: v_add_f32_e32 v1, v1, v9 -; CI-NEXT: v_add_f32_e32 v2, v2, v6 -; CI-NEXT: v_add_f32_e32 v3, v3, v7 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v5 +; CI-NEXT: v_add_f32_e32 v3, v8, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: @@ -208,24 +213,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; CI-NEXT: v_bfe_u32 v7, v2, 16, 15 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v8, |v2| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: @@ -303,30 +310,34 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| +; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v7 -; CI-NEXT: v_add_f32_e32 v2, v4, v8 -; CI-NEXT: v_add_f32_e32 v3, v5, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v2, v9, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v3, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: @@ -411,24 +422,25 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_var_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_var_v2f16: @@ -496,20 +508,22 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negk_v2f16: @@ -579,16 +593,19 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fabs_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negk_negk_v2f16: @@ -655,16 +672,19 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_posk_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_posk_v2f16: @@ -730,20 +750,22 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_negk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_fabs_v2f16: @@ -812,21 +834,23 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; CI-LABEL: add_select_negliteralk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v6, 0xc4800000 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negliteralk_fabs_v2f16: @@ -895,20 +919,22 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_posk_v2f16: @@ -977,20 +1003,22 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_posk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_fabs_v2f16: @@ -1059,24 +1087,25 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_fneg_v2f16: @@ -1140,30 +1169,34 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_sub_f32_e32 v2, v8, v2 -; CI-NEXT: v_sub_f32_e32 v3, v9, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v3, v6, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, v5, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: @@ -1236,26 +1269,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; CI-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: @@ -1325,30 +1358,34 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_sub_f32_e32 v2, v8, v4 -; CI-NEXT: v_sub_f32_e32 v3, v9, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v2, v6, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_sub_f32_e32 v3, v5, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v3, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: @@ -1421,28 +1458,26 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fneg_var_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_var_v2f16: @@ -1510,20 +1545,22 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fneg_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_negk_v2f16: @@ -1587,21 +1624,23 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_fneg_inv2pi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v6, 0xbe230000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_inv2pi_v2f16: @@ -1665,21 +1704,23 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; CI-LABEL: add_select_fneg_neginv2pi_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v6, 0x3e230000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_neginv2pi_v2f16: @@ -1743,16 +1784,19 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_negk_v2f16: @@ -1818,18 +1862,21 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; CI-LABEL: add_select_negliteralk_negliteralk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v4, 0xc5800000 -; CI-NEXT: v_mov_b32_e32 v5, 0xc5000000 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0xc5800000 +; CI-NEXT: v_mov_b32_e32 v5, 0xc5000000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negliteralk_negliteralk_v2f16: @@ -1895,16 +1942,19 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fneg_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_negk_negk_v2f16: @@ -1971,20 +2021,22 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_negk_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_fneg_v2f16: @@ -2048,20 +2100,22 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fneg_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_posk_v2f16: @@ -2125,20 +2179,22 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_posk_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_fneg_v2f16: @@ -2202,28 +2258,26 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 -; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negfabs_fabs_v2f16: @@ -2298,28 +2352,26 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_fabs_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_or_b32_e32 v4, 0x80008000, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negfabs_v2f16: @@ -2394,28 +2446,26 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_neg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 -; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_neg_fabs_v2f16: @@ -2489,28 +2539,26 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_neg_v2f16: @@ -2584,24 +2632,25 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_neg_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_neg_negfabs_v2f16: @@ -2671,24 +2720,25 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; CI-NEXT: v_sub_f32_e32 v0, v6, v0 -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negfabs_neg_v2f16: @@ -2758,24 +2808,23 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: mul_select_negfabs_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v4 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_posk_v2f16: @@ -2845,24 +2894,23 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: mul_select_posk_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v4 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_posk_negfabs_v2f16: @@ -2932,24 +2980,23 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: mul_select_negfabs_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v4 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc +; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_negk_v2f16: @@ -3019,24 +3066,23 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: mul_select_negk_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v4 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc +; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negk_negfabs_v2f16: @@ -3110,24 +3156,27 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-LABEL: select_fneg_posk_src_add_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_add_v2f16: @@ -3198,16 +3247,19 @@ define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 ; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz: @@ -3272,24 +3324,27 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: select_fneg_posk_src_sub_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_add_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_add_f32_e32 v2, -4.0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, -4.0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_sub_v2f16: @@ -3360,16 +3415,19 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI-LABEL: select_fneg_posk_src_sub_v2f16_nsz: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 -; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 ; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_sub_v2f16_nsz: @@ -3434,16 +3492,19 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: select_fneg_posk_src_mul_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 ; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_mul_v2f16: @@ -3508,28 +3569,30 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-LABEL: select_fneg_posk_src_fma_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_fma_f32 v3, v3, 4.0, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v2, v2, 4.0, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_fma_f32 v4, v5, 4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_fma_f32 v2, v2, 4.0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_fma_v2f16: @@ -3602,30 +3665,32 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; CI-LABEL: select_fneg_posk_src_fmad_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3 -; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, 4.0, v5 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 ; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_add_f32_e32 v2, v2, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_fmad_v2f16: @@ -3698,22 +3763,24 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> ; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5 +; CI-NEXT: v_sub_f32_e32 v4, v5, v4 ; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_sub_f32_e32 v2, v2, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_sub_f32_e32 v2, v2, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v4, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_sub_f32_e32 v3, v3, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 09f7e7a926376..0de366132e31e 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -879,16 +879,18 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe: @@ -948,16 +950,18 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: @@ -1017,16 +1021,18 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: @@ -1086,16 +1092,18 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: @@ -1122,16 +1130,18 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe: @@ -1191,16 +1201,18 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: @@ -1260,16 +1272,18 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: @@ -1329,16 +1343,18 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: @@ -1365,26 +1381,30 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe: @@ -1463,26 +1483,30 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: @@ -1561,26 +1585,30 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: @@ -1659,26 +1687,30 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: @@ -1707,26 +1739,30 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe: @@ -1805,26 +1841,30 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: @@ -1903,26 +1943,30 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: @@ -2001,26 +2045,30 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index da454eeed8759..52cb3935b9a01 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1651,31 +1651,9 @@ define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) { ; SI-LABEL: v_select_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_select_v4f16: @@ -1701,30 +1679,34 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; SI-LABEL: v_vselect_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v4f16: @@ -1792,55 +1774,11 @@ define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) { ; SI-LABEL: v_select_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_select_v8f16: @@ -1869,54 +1807,62 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-LABEL: v_vselect_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; SI-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5] +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 -; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v8f16: @@ -2021,107 +1967,15 @@ define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) { ; SI-LABEL: v_select_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v7, v22, v7 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_or_b32_e32 v11, v18, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc -; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_select_v16f16: @@ -2156,128 +2010,120 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cndmask_b32_e32 v29, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cndmask_b32_e32 v27, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cndmask_b32_e32 v25, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cndmask_b32_e32 v23, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cndmask_b32_e32 v21, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cndmask_b32_e32 v19, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cndmask_b32_e32 v17, v32, v31, vcc +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 +; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v33 -; SI-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; SI-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 -; SI-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 -; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v29 -; SI-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v24 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v16f16: @@ -2466,262 +2312,27 @@ define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) { ; SI-LABEL: v_select_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v17, v32, v17 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v13, v32, v13 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc -; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; SI-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc -; SI-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc -; SI-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc -; SI-NEXT: v_cndmask_b32_e32 v31, v31, v20, vcc -; SI-NEXT: v_cndmask_b32_e32 v30, v30, v22, vcc -; SI-NEXT: v_cndmask_b32_e32 v29, v29, v24, vcc -; SI-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc -; SI-NEXT: v_cndmask_b32_e32 v32, v25, v28, vcc -; SI-NEXT: v_cndmask_b32_e32 v33, v23, v21, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_select_v32f16: @@ -2777,328 +2388,324 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; SI-LABEL: v_vselect_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v33, v42, v41, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 +; SI-NEXT: v_cndmask_b32_e32 v35, v45, v43, vcc +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; SI-NEXT: v_cndmask_b32_e32 v36, v47, v46, vcc +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cndmask_b32_e32 v37, v57, v56, vcc +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; SI-NEXT: v_cndmask_b32_e32 v38, v34, v58, vcc +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 +; SI-NEXT: v_cndmask_b32_e32 v34, v41, v40, vcc +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 +; SI-NEXT: v_cndmask_b32_e32 v39, v40, v39, vcc +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_cndmask_b32_e32 v50, v42, v41, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cndmask_b32_e32 v51, v40, v55, vcc +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v52, v42, v41, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 +; SI-NEXT: v_cndmask_b32_e32 v53, v42, v41, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 +; SI-NEXT: v_cndmask_b32_e32 v54, v47, v43, vcc +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 +; SI-NEXT: v_cndmask_b32_e32 v49, v56, v47, vcc +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 +; SI-NEXT: v_cndmask_b32_e32 v48, v58, v57, vcc +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cndmask_b32_e32 v44, v58, v56, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cndmask_b32_e32 v15, v58, v15, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cndmask_b32_e32 v14, v58, v14, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 +; SI-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 +; SI-NEXT: v_cndmask_b32_e32 v13, v28, v58, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 +; SI-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 +; SI-NEXT: v_cndmask_b32_e32 v11, v26, v28, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 +; SI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 +; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 +; SI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 +; SI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v60 +; SI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 +; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 +; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 +; SI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e64 v6, v31, v6, s[4:5] -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v17, v31, v17, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll index 3c7b5bf97b879..8346e1cac399f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll @@ -1,5 +1,5 @@ -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED16 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck %s --check-prefixes=CHECK,SPLIT16 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck %s @global = addrspace(1) global i32 poison, align 4 @@ -22,11 +22,9 @@ define amdgpu_ps void @vec_of_i8(<4 x i8> inreg %v4i8) { ret void } -; Vectors of 16-bit types are packed for newer architectures and unpacked for older ones. - +; Vectors of 16-bit types are packed. ; CHECK-LABEL: vec_of_16_bit_ty: -; PACKED16: TotalNumSgprs: 3 -; SPLIT16: TotalNumSgprs: 6 +; CHECK: TotalNumSgprs: 3 define amdgpu_ps void @vec_of_16_bit_ty(<2 x i16> inreg %v2i16, <4 x half> inreg %v4half) { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 00214ef36e1f0..ec940d9d0955f 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1029,13 +1029,30 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { -; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_v3i16_fastcc_v3i16: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] +; +; HAWAII-LABEL: sibling_call_v3i16_fastcc_v3i16: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a) ret <3 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 09c0e775f783d..3690529af06b1 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -158,20 +158,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: s_movk_i32 s4, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v3 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: @@ -216,25 +215,24 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: s_movk_i32 s4, 0x8000 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 -; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_med3_i32 v4, v4, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i16: @@ -288,30 +286,30 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v0 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x8000 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 -; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_med3_i32 v6, v6, s4, v7 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v7 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v7 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 -; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 40aac82888de2..5335787a820be 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -46,10 +46,7 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -93,15 +90,11 @@ define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half ; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: @@ -247,13 +240,10 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -301,16 +291,11 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal ; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index 3e889c0a0670a..dc57c22f16a26 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -49,9 +49,13 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: @@ -102,15 +106,20 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo ; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index ebd4bc881f2af..8e43f4e788bb0 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -32,16 +32,13 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v2f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v5 -; GFX7-NEXT: flat_store_dword v[2:3], v4 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v1 +; GFX7-NEXT: flat_store_dword v[1:2], v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") @@ -53,22 +50,17 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v3f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v6 -; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[2:3], v4 +; GFX7-NEXT: flat_store_dword v[0:1], v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") @@ -80,28 +72,22 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v4f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v3 -; GFX7-NEXT: flat_store_dword v[4:5], v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") @@ -128,9 +114,13 @@ define <2 x half> @v2f16_return(<2 x float> %arg) #0 { ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %fptrunc @@ -140,15 +130,20 @@ define <3 x half> @v3f16_return(<3 x float> %arg) #0 { ; GFX7-LABEL: v3f16_return: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %fptrunc @@ -158,18 +153,26 @@ define <4 x half> @v4f16_return(<4 x float> %arg) #0 { ; GFX7-LABEL: v4f16_return: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %fptrunc @@ -218,17 +221,13 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_writelane_b32 v40, s16, 2 ; GFX7-NEXT: v_writelane_b32 v40, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 @@ -307,23 +306,13 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_mov_b32_e32 v41, v1 ; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_dword v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s32, s33 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: flat_store_dword v[40:41], v0 -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] @@ -355,30 +344,10 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_mov_b32_e32 v41, v1 ; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[40:41], v4 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v41, vcc +; GFX7-NEXT: flat_store_dword v[2:3], v1 +; GFX7-NEXT: flat_store_dword v[40:41], v0 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 @@ -416,56 +385,16 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_mov_b32_e32 v41, v1 ; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v40 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v40 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v5 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[40:41], v8 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v40 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v41, vcc +; GFX7-NEXT: flat_store_dword v[4:5], v3 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v40 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v41, vcc +; GFX7-NEXT: flat_store_dword v[3:4], v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v41, vcc +; GFX7-NEXT: flat_store_dword v[2:3], v1 +; GFX7-NEXT: flat_store_dword v[40:41], v0 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 @@ -499,37 +428,14 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index cf844653cb8b6..0fd79048301cc 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -144,7 +144,6 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: trunc_v2i64_arg_to_v2i16: @@ -161,18 +160,11 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 { ; on the final result, due to losing the fact that the upper half of ; the lhs vector was undef. define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) { -; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16) %trunc = trunc <2 x i32> %lshr to <2 x i16> @@ -184,7 +176,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: @@ -199,18 +190,11 @@ define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 } define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) { -; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16) %trunc = trunc <2 x i32> %ashr to <2 x i16> @@ -223,7 +207,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_ashr_i32_e32 v0, -4, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression: @@ -243,7 +226,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression: @@ -263,7 +245,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshl_b32_e32 v0, 2, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression: @@ -283,7 +264,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression: @@ -298,18 +278,11 @@ define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 % } define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) { -; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0xffff -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17) %trunc = trunc <2 x i32> %lshr to <2 x i16> @@ -322,7 +295,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, 17, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression: @@ -342,7 +314,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xfffc, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression: @@ -357,20 +328,12 @@ define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 % } define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) { -; SI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_lo_u32 v0, v0, 18 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mul_lo_u32 v0, v0, 18 -; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v0, v0, 18 +; GCN-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18) %trunc = trunc <2 x i32> %lshr to <2 x i16> @@ -387,7 +350,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 ; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression: @@ -417,7 +379,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 ; SI-NEXT: v_mul_lo_u32 v1, v1, 18 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression: @@ -444,7 +405,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x38e38e39 ; SI-NEXT: v_mul_hi_u32 v0, v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_bfe_u32 v0, v0, 2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -471,7 +431,6 @@ define <2 x i16> @vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 ; SI-NEXT: v_mul_lo_u32 v1, v1, 18 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index 7f89581d00fde..63ab0a3bde0e6 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -145,16 +145,16 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v2 ; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: @@ -190,21 +190,20 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v3i16: @@ -244,22 +243,22 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v6 +; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll index 3ddb2f02c48fe..97bb4112ecdcd 100644 --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -272,16 +272,16 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 -; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: @@ -311,21 +311,20 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 -; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v2, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v3i16: @@ -358,24 +357,24 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 -; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v2, v7, v6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v2, v6 -; GFX6-NEXT: v_max_u32_e32 v2, v3, v8 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v2, v5, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll index 4f4e2115810c6..c159cf28fd2db 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll @@ -1005,12 +1005,14 @@ define i16 @test_vector_reduce_add_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_add_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_add_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1121,20 +1123,18 @@ define i16 @test_vector_reduce_add_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_add_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_add_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_add_v3i16: @@ -1247,20 +1247,24 @@ define i16 @test_vector_reduce_add_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_add_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_add_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v3 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1389,28 +1393,36 @@ define i16 @test_vector_reduce_add_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_add_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v4, v5 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_add_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v6 ; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v7 +; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v3 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1573,44 +1585,60 @@ define i16 @test_vector_reduce_add_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_add_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v8, v9 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v10, v11 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v5, v4 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_add_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v12 ; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v13 ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v14 ; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v15 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v6 ; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v7 +; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v3 ; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll index c01b825b523ff..8c005bb083a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll @@ -973,26 +973,12 @@ entry: } define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_and_v2i16: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_and_v2i16: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_and_v2i16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_and_v2i16: ; GFX8: ; %bb.0: ; %entry @@ -1086,16 +1072,17 @@ define i16 @test_vector_reduce_and_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_and_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_and_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_and_v3i16: @@ -1195,21 +1182,21 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_and_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_and_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1339,31 +1326,32 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_and_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, v3, v7 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_and_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 @@ -1521,14 +1509,6 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_and_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_and_b32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_and_b32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_and_b32_e32 v3, v3, v7 @@ -1536,36 +1516,45 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_and_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v12 -; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index ab66959de07d9..98e7df04be444 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -20,12 +20,11 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -34,10 +33,11 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -158,31 +158,30 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v3half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -312,38 +311,38 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v4half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -500,66 +499,66 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v8half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -788,122 +787,122 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v11 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v16half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v15 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v16 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll index acca5169126e4..b9dcb1b7295c2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll @@ -20,16 +20,16 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v2half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 @@ -184,23 +184,22 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v3half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -389,28 +388,28 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v4half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -616,46 +615,46 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v7 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v6 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v5 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v8half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v4, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -963,80 +962,80 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v7 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v8, v9 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v10, v11 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v12, v13 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v14, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v12 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v11 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v5, v10 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v9 +; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v16half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v4, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v8 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v15 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX7-GISEL-NEXT: v_max_f32_e32 v5, v6, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v13 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v7, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: v_max_f32_e32 v6, v6, v7 -; GFX7-GISEL-NEXT: v_max_f32_e32 v7, v8, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll index 9c54c30daefe0..58da94d7c4683 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll @@ -12,11 +12,10 @@ define half @test_vector_reduce_fmaximum_v2half(<2 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v2half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -99,19 +98,17 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v3half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v3half: @@ -217,21 +214,19 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v4half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc @@ -355,44 +350,40 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc ; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v8half: @@ -585,84 +576,76 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v16, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-NEXT: v_max_f32_e32 v16, v0, v15 ; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v15, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v14 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v13 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v11 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v10 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_max_f32_e32 v1, v0, v9 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc ; GFX7-NEXT: v_max_f32_e32 v1, v0, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: v_max_f32_e32 v1, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v15 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll index 3f56d9871c568..07524d6917740 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll @@ -20,16 +20,16 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v2half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 @@ -184,23 +184,22 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v3half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -389,28 +388,28 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v4half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -616,46 +615,46 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v7 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v6 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v5 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v8half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v4, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -963,80 +962,80 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v7 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v8, v9 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v10, v11 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v12, v13 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v14, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v12 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v11 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v5, v10 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v9 +; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v16half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v10 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v4 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v4, v5 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v8 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v12 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v13 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v5, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v8, v14 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v15 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX7-GISEL-NEXT: v_min_f32_e32 v5, v6, v7 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v6, v12 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v7, v13 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v7, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-GISEL-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v7, v8, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll index 9d3a4f387bfc8..16732a429e4b0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll @@ -14,11 +14,10 @@ define half @test_vector_reduce_fminimum_v2half(<2 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v2half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -123,19 +122,17 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v3half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min_f32_e32 v3, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v3half: @@ -266,21 +263,19 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v4half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc @@ -433,44 +428,40 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v7, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc ; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v8half: @@ -705,84 +696,76 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v16, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-NEXT: v_min_f32_e32 v16, v0, v15 ; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v15, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v14 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v13 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v11 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v10 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_min_f32_e32 v1, v0, v9 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc ; GFX7-NEXT: v_min_f32_e32 v1, v0, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: v_min_f32_e32 v1, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v15 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index a6aff732830ee..45fc82abb507e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -20,12 +20,11 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -34,10 +33,11 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -158,31 +158,30 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v3half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -312,38 +311,38 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v4half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -500,66 +499,66 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v8half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -788,122 +787,122 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v11 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v16half: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v9 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v3 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v11 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v6 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v13 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v7 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v15 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v8 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v16 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 94448411cfd0e..d707ab797f66c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -972,17 +972,15 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1093,23 +1091,20 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, 0x10000, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, 0x10000, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v0, 1, v0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_mul_v3i16: @@ -1231,17 +1226,21 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1376,25 +1375,33 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v3, v7 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v2, v5, v7 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v6 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v5 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v2, v2, v6 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1563,41 +1570,57 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v11, v11, v15 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v10, v10, v14 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v9, v9, v13 +; GFX7-SDAG-NEXT: v_mul_u32_u24_e32 v8, v8, v12 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v8, v9 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v10, v11 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v5, v4 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v9 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v2, v2, v10 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v3, v3, v11 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v4, v4, v12 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v5, v5, v13 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v6, v6, v14 -; GFX7-GISEL-NEXT: v_mul_lo_u32 v7, v7, v15 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v4, v8, v12 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v5, v9, v13 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v6, v10, v14 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v7, v11, v15 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_mul_lo_u32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll index b9ab28dfdfae7..73a3dba77a497 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll @@ -999,21 +999,15 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_or_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_or_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1109,20 +1103,18 @@ define i16 @test_vector_reduce_or_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_or_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_or_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_or_v3i16: @@ -1216,8 +1208,8 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_or_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1225,12 +1217,13 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_or_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1360,31 +1353,33 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_or_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_or_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1536,14 +1531,6 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_or_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 @@ -1551,36 +1538,46 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, 0xffff0000, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_or_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v12 -; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index 43430dc132f0e..1e0c0bb072a76 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -1637,7 +1637,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1645,14 +1645,14 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v0 +; GFX7-GISEL-NEXT: v_max_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smax_v2i16: @@ -1761,24 +1761,22 @@ define i16 @test_vector_reduce_smax_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, 0x80000000, v2 -; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, s4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v2, v0 ; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_smax_v3i16: @@ -1906,29 +1904,29 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v3 +; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_max_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smax_v4i16: @@ -2056,44 +2054,44 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_max_i32_e32 v5, v5, v7 +; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_max3_i32 v2, v5, v4, v6 +; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v6, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v7, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v3, v3, v4 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v2, v5 ; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_max_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smax_v8i16: @@ -2255,74 +2253,74 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v13, 16, v4 ; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v8, 16, v2 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v12, 16, v6 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v15, 16, v7 ; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_max_i32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_max_i32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_max_i32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v4, v12 -; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v5, v13 -; GFX7-SDAG-NEXT: v_max3_i32 v1, v1, v3, v7 +; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-SDAG-NEXT: v_max_i32_e32 v9, v9, v13 +; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_max_i32_e32 v11, v11, v15 +; GFX7-SDAG-NEXT: v_max_i32_e32 v10, v10, v14 +; GFX7-SDAG-NEXT: v_max_i32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_max_i32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_max3_i32 v4, v9, v8, v12 ; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_max3_i32 v2, v4, v10, v11 +; GFX7-SDAG-NEXT: v_max3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smax_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v9, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v11, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v3, v3, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v4, v4, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v13, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v5, v5, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v6, v6, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GFX7-GISEL-NEXT: v_max_i32_e32 v7, v7, v8 +; GFX7-GISEL-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v4, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v8, v8, v9 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v5, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v5, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v4, v4, v9 ; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v6, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v6, v6, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_bfe_i32 v6, v3, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v7, v7, 16, 16 +; GFX7-GISEL-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX7-GISEL-NEXT: v_max_i32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX7-GISEL-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_max_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_max_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_max_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smax_v16i16: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 94de3bc7241c1..e30990557bb4d 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -1637,7 +1637,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1645,14 +1645,14 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v0 +; GFX7-GISEL-NEXT: v_min_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smin_v2i16: @@ -1761,24 +1761,22 @@ define i16 @test_vector_reduce_smin_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, 0x7fff0000, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, s4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v2, v0 ; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v2, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_smin_v3i16: @@ -1906,29 +1904,29 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v3 +; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_min_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smin_v4i16: @@ -2056,44 +2054,44 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_min_i32_e32 v5, v5, v7 +; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_min3_i32 v2, v5, v4, v6 +; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v6, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v2, v4 -; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v7, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v3, v3, v4 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v4, v2 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_min_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smin_v8i16: @@ -2254,74 +2252,74 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; GFX7-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v13, 16, v4 ; GFX7-SDAG-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v8, 16, v2 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GFX7-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-SDAG-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v12, 16, v6 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GFX7-SDAG-NEXT: v_ashrrev_i32_e32 v15, 16, v7 ; GFX7-SDAG-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_min_i32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_min_i32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_min_i32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v4, v12 -; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v5, v13 -; GFX7-SDAG-NEXT: v_min3_i32 v1, v1, v3, v7 +; GFX7-SDAG-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-SDAG-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-SDAG-NEXT: v_min_i32_e32 v9, v9, v13 +; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_min_i32_e32 v11, v11, v15 +; GFX7-SDAG-NEXT: v_min_i32_e32 v10, v10, v14 +; GFX7-SDAG-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_min_i32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_min3_i32 v4, v9, v8, v12 ; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_min3_i32 v2, v4, v10, v11 +; GFX7-SDAG-NEXT: v_min3_i32 v0, v0, v1, v3 +; GFX7-SDAG-NEXT: v_min_i32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_smin_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v9, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v11, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v3, v3, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v13, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v5, v5, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX7-GISEL-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-GISEL-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GFX7-GISEL-NEXT: v_min_i32_e32 v7, v7, v8 +; GFX7-GISEL-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v4, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v0, v0, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v4, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v5, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v1, v1, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v5, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v4, v4, v9 ; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v6, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v2, v2, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v6, v6, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v5, v5, v9 ; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_bfe_i32 v6, v3, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v3, v3, 16, 16 +; GFX7-GISEL-NEXT: v_bfe_i32 v7, v7, 16, 16 +; GFX7-GISEL-NEXT: v_min_i32_e32 v6, v6, v9 ; GFX7-GISEL-NEXT: v_min_i32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_min_i32_e32 v5, v8, v5 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_i32_e32 v2, v5, v2 ; GFX7-GISEL-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_min_i32_e32 v1, 0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_min_i32_e32 v1, v2, v0 +; GFX7-GISEL-NEXT: v_min_i32_e32 v0, 0, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_smin_v16i16: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 87b1e78f73da7..b9b9e0848333c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -1530,7 +1530,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1538,8 +1538,8 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, 0, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1652,21 +1652,21 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v1, 0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_max3_u32 v0, v1, v0, 0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_umax_v3i16: @@ -1788,23 +1788,23 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_max3_u32 v0, v2, v3, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, 0, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1936,38 +1936,38 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6 +; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v5, v7 +; GFX7-SDAG-NEXT: v_max3_u32 v1, v2, v1, v3 +; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v6 ; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_max_u32_e32 v2, v2, v4 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX7-GISEL-NEXT: v_max_u32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_u32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_u32_e32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, 0, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2133,68 +2133,68 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_max_u32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_max_u32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v4, v12 -; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v5, v13 -; GFX7-SDAG-NEXT: v_max3_u32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_max_u32_e32 v4, v9, v13 +; GFX7-SDAG-NEXT: v_max_u32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_max_u32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_max_u32_e32 v5, v11, v15 +; GFX7-SDAG-NEXT: v_max_u32_e32 v7, v10, v14 +; GFX7-SDAG-NEXT: v_max3_u32 v2, v4, v2, v6 +; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v8, v12 +; GFX7-SDAG-NEXT: v_max3_u32 v0, v0, v7, v5 +; GFX7-SDAG-NEXT: v_max3_u32 v1, v2, v1, v3 +; GFX7-SDAG-NEXT: v_max_u32_e32 v0, v1, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umax_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX7-GISEL-NEXT: v_max_u32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX7-GISEL-NEXT: v_max_u32_e32 v3, v3, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; GFX7-GISEL-NEXT: v_max_u32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX7-GISEL-NEXT: v_max_u32_e32 v5, v5, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX7-GISEL-NEXT: v_max_u32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GFX7-GISEL-NEXT: v_max_u32_e32 v7, v7, v8 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_max_u32_e32 v4, v8, v12 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_max_u32_e32 v5, v9, v13 ; GFX7-GISEL-NEXT: v_max_u32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_max_u32_e32 v6, v10, v14 ; GFX7-GISEL-NEXT: v_max_u32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_max_u32_e32 v7, v11, v15 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_max_u32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_max_u32_e32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_max_u32_e32 v1, 0, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 2dd549c191ce9..3d1acef777f5d 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -1273,21 +1273,13 @@ entry: ; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i16: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_umin_v2i16: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_umin_v2i16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_umin_v2i16: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -1395,22 +1387,22 @@ define i16 @test_vector_reduce_umin_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: s_mov_b32 s4, 0xffff -; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v1, s4 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-SDAG-NEXT: s_mov_b32 s4, 0xffff +; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_min3_u32 v0, v1, v0, s4 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_umin_v3i16: @@ -1533,23 +1525,23 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_min3_u32 v0, v2, v3, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1678,38 +1670,38 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v5 -; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6 +; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v5, v7 +; GFX7-SDAG-NEXT: v_min3_u32 v1, v2, v1, v3 +; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v6 ; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_min_u32_e32 v2, v2, v4 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX7-GISEL-NEXT: v_min_u32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_u32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_u32_e32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,68 +1864,68 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX7-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-SDAG-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-SDAG-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX7-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_min_u32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_min_u32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v4, v12 -; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v5, v13 -; GFX7-SDAG-NEXT: v_min3_u32 v1, v1, v3, v7 -; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v2, v6 -; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_min_u32_e32 v4, v9, v13 +; GFX7-SDAG-NEXT: v_min_u32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_min_u32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_min_u32_e32 v5, v11, v15 +; GFX7-SDAG-NEXT: v_min_u32_e32 v7, v10, v14 +; GFX7-SDAG-NEXT: v_min3_u32 v2, v4, v2, v6 +; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v8, v12 +; GFX7-SDAG-NEXT: v_min3_u32 v0, v0, v7, v5 +; GFX7-SDAG-NEXT: v_min3_u32 v1, v2, v1, v3 +; GFX7-SDAG-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_umin_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX7-GISEL-NEXT: v_min_u32_e32 v2, v2, v8 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX7-GISEL-NEXT: v_min_u32_e32 v3, v3, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; GFX7-GISEL-NEXT: v_min_u32_e32 v4, v4, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX7-GISEL-NEXT: v_min_u32_e32 v5, v5, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX7-GISEL-NEXT: v_min_u32_e32 v6, v6, v8 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GFX7-GISEL-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 +; GFX7-GISEL-NEXT: v_min_u32_e32 v4, v8, v12 ; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v5 +; GFX7-GISEL-NEXT: v_min_u32_e32 v5, v9, v13 ; GFX7-GISEL-NEXT: v_min_u32_e32 v2, v2, v6 +; GFX7-GISEL-NEXT: v_min_u32_e32 v6, v10, v14 ; GFX7-GISEL-NEXT: v_min_u32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_min_u32_e32 v7, v11, v15 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_min_u32_e32 v2, v4, v6 ; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_min_u32_e32 v3, v5, v7 +; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_u32_e32 v1, v2, v3 ; GFX7-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll index 18c5db94d322c..fbc9606718bf0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll @@ -945,21 +945,15 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v2i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1055,20 +1049,18 @@ define i16 @test_vector_reduce_xor_v3i16(<3 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v3i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v3i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_xor_v3i16: @@ -1162,8 +1154,8 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1171,12 +1163,13 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v4i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1306,31 +1299,33 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v8i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -1483,51 +1478,55 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_xor_b32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v3, v3, v11 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v3, v0, v2 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_xor_b32_e32 v0, v3, v0 ; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_xor_v16i16: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX7-GISEL-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v12 -; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GFX7-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX7-GISEL-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-GISEL-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX7-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-GISEL-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX7-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX7-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-GISEL-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX7-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6